diff --git a/R/dictionary.R b/R/dictionary.R index 57228d9d..dbb40a35 100644 --- a/R/dictionary.R +++ b/R/dictionary.R @@ -2,3 +2,5 @@ scaled_string = "_scaled" adjusted_string = "_adjusted" + +warning_for_scaling_with_few_genes = "tidybulk says: There are < 100 features/genes that are present in all you samples. Because edgeR::calcNormFactors does not allow NAs, the scaling is performed on that limited set of features.genes. The scaling could not be accurate, it is adivasble to perform impute_missing_abundance() before scaling. It is possible to filter the imputed counts after scaling." diff --git a/R/functions.R b/R/functions.R index e26d2942..18a9f915 100755 --- a/R/functions.R +++ b/R/functions.R @@ -3174,7 +3174,8 @@ fill_NA_using_formula = function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, - .abundance_scaled = NULL){ + .abundance_scaled = NUL, + suffix = "_imputed"){ # Get column names .sample = enquo(.sample) @@ -3194,7 +3195,6 @@ fill_NA_using_formula = function(.data, # Create NAs for missing sample/transcript pair - .data_completed = .data %>% @@ -3203,9 +3203,29 @@ fill_NA_using_formula = function(.data, mutate(ct_data = map(ct_data, ~ .x %>% droplevels() %>% complete(!!as.symbol(quo_name(.sample)), !!.transcript) )) %>% unnest(ct_data) + # For non scaled counts create a pseudo scale based on library size, then calculate imputed and scale back + abundance_is_int = .data %>% slice(1) %>% pull(!!.abundance) %>% class() %>% equals("integer") + .data = + .data %>% + group_by(!!.sample) %>% + mutate(library_size__ = sum(!!.abundance)) %>% + ungroup() %>% + mutate(!!.abundance := !!.abundance / library_size__) + + imputed_column = sprintf("%s%s", quo_name(.abundance), suffix ) + imputed_column_scaled = sprintf("%s%s", quo_name(.abundance_scaled), suffix ) + + # Divide the dataset .data_OK = .data %>% - anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) + anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) %>% + + # Add the imputed column + mutate(!!as.symbol(imputed_column) := !!.abundance) %>% + when( + quo_is_symbol(.abundance_scaled) ~ .x %>% + mutate(!!as.symbol(imputed_column_scaled) := !!.abundance_scaled) + ) .data_FIXED = .data %>% @@ -3216,16 +3236,22 @@ fill_NA_using_formula = function(.data, .data_completed %>% filter(!!.abundance %>% is.na) %>% select(!!.sample, !!.transcript) %>% - left_join(.data %>% pivot_sample(!!.sample)) %>% - left_join(.data %>% pivot_transcript(!!.transcript)) - ) %>% + left_join(.data %>% pivot_sample(!!.sample), by = quo_name(.sample)) %>% + left_join(.data %>% pivot_transcript(!!.transcript), by = quo_name(.transcript)) + ) + + # Clean environment + rm(.data_completed) + gc() + + .data_FIXED %>% # Group by covariate nest(cov_data = -c(col_formula, !!.transcript)) %>% mutate(cov_data = map(cov_data, ~ .x %>% mutate( - !!.abundance := ifelse( + !!as.symbol(imputed_column) := ifelse( !!.abundance %>% is.na, median(!!.abundance, na.rm = TRUE),!!.abundance ) @@ -3235,7 +3261,7 @@ fill_NA_using_formula = function(.data, ifelse_pipe( quo_is_symbol(.abundance_scaled), ~ .x %>% mutate( - !!.abundance_scaled := ifelse( + !!as.symbol(imputed_column_scaled) := ifelse( !!.abundance_scaled %>% is.na, median(!!.abundance_scaled, na.rm = TRUE),!!.abundance_scaled ) @@ -3250,6 +3276,11 @@ fill_NA_using_formula = function(.data, .data_OK %>% bind_rows(.data_FIXED) %>% + # Scale back the pseudoscaling + mutate(!!.abundance := !!.abundance * library_size__) %>% + select(-library_size__) %>% + when(abundance_is_int ~ mutate(., !!.abundance := as.integer(!!.abundance)), ~ (.)) %>% + # Reattach internals reattach_internals(.data) @@ -3290,6 +3321,9 @@ fill_NA_using_value = function(.data, .feature = enquo(.transcript) .value = enquo(.abundance) + # Scale based on library size + + # Create NAs for missing element/feature pair df_to_impute = .data %>% diff --git a/R/methods.R b/R/methods.R index a9f92ee3..f0fa6588 100755 --- a/R/methods.R +++ b/R/methods.R @@ -3685,7 +3685,8 @@ setGeneric("impute_missing_abundance", function(.data, .formula, .sample = NULL, .transcript = NULL, - .abundance = NULL) + .abundance = NULL, + suffix = "") standardGeneric("impute_missing_abundance")) # Set internal @@ -3693,7 +3694,8 @@ setGeneric("impute_missing_abundance", function(.data, .formula, .sample = NULL, .transcript = NULL, - .abundance = NULL) + .abundance = NULL, + suffix = "") { # Get column names .sample = enquo(.sample) @@ -3724,7 +3726,8 @@ setGeneric("impute_missing_abundance", function(.data, .sample = !!.sample, .transcript = !!.transcript, .abundance = !!.abundance, - .abundance_scaled = !!.abundance_scaled) %>% + .abundance_scaled = !!.abundance_scaled, + suffix = suffix) %>% # Reattach internals reattach_internals(.data) diff --git a/R/methods_SE.R b/R/methods_SE.R index a1432700..96f269ab 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -151,8 +151,13 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se) when(nrow(.) == 0 ~ stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data."), ~ (.)) my_assay = assays(.data_filtered) %>% as.list() %>% .[1] - my_counts_filtered = my_assay[[1]] - library_size_filtered = my_counts_filtered %>% colSums() + + # Drop genes with NAs, as edgeR::calcNormFactors does not accept them + my_counts_filtered = my_assay[[1]] %>% na.omit() + library_size_filtered = my_counts_filtered %>% colSums(na.rm = TRUE) + + # If not enough genes, warning + if(nrow(my_counts_filtered)<100) warning(warning_for_scaling_with_few_genes) # Set column name for value scaled value_scaled = my_assay %>% names() %>% paste0(scaled_string) @@ -1315,7 +1320,8 @@ setMethod("keep_variable", edgeR::filterByExpr( min.count = minimum_counts, group = string_factor_of_interest, - min.prop = minimum_proportion + min.prop = minimum_proportion, + lib.size = colSums(., na.rm=TRUE) ) %>% not() %>% which %>% @@ -1956,89 +1962,82 @@ setMethod("pivot_transcript", .impute_missing_abundance_se = function(.data, - .formula) { + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + suffix = "") { + + .abundance = enquo(.abundance) + + .assay_to_impute = + .abundance %>% + when( + quo_is_symbolic(.) ~ assays(.data)[quo_names(.abundance)], + ~ assays(.data) + ) + # Split data by formula and impute + imputed_dataframe = + map2( + # Capture assay names as we need to know if scaled is in the name + as.list(.assay_to_impute), names(.assay_to_impute), + ~ { - col_formula = - colData(.data) %>% - as_tibble() %>% - select(parse_formula(.formula)) %>% - distinct() %>% - select_if(function(x) is.character(x) | is.logical(x) | is.factor(x)) %>% - colnames + # Pseudo-scale if not scaled + if(!grepl("_scaled", .y)) library_size = colSums(.x, na.rm = TRUE) + if(!grepl("_scaled", .y)) .x = .x / library_size - # Create NAs for missing sample/transcript pair - assays(.data) = - assays(.data) %>% - as.list() %>% - map(~{ - .my_data = - .x %>% - as.matrix() %>% - as_tibble(rownames = "transcript") %>% - gather(sample, abundance, -transcript) %>% - - # Attach annotation - left_join( - colData(.data) %>% - as_tibble(rownames="sample") %>% - select(sample, col_formula), - by="sample" - ) + # Log + need_log = max(.x, na.rm=T) > 50 + if(need_log) .x = log1p(.x) - # Data used for filtering - NA_data = - .my_data %>% - filter(abundance %>% is.na) %>% - select( transcript, col_formula) %>% - distinct() - - # If no missing just return the same matrix - if(nrow(NA_data) == 0) return(.x) - - .data_OK = - .my_data %>% - anti_join(NA_data, by = c("transcript", col_formula)) - - - .data_FIXED = - .my_data %>% - inner_join(NA_data, by = c("transcript", col_formula)) %>% - - # Group by covariate - nest(cov_data = -c(col_formula, transcript)) %>% - mutate(cov_data = map(cov_data, ~ - .x %>% - mutate(abundance = - case_when( - is.na(abundance) ~ median(abundance, na.rm=TRUE), - TRUE ~ abundance - ) - ) %>% - - # Through warning if group of size 1 - when( - nrow(.) %>% st(2) ~ warning("tidybulk says: According to your design matrix, u have sample groups of size < 2, so you your dataset could still be sparse."), - ~ (.) - ) - )) %>% - unnest(cov_data) - - .data_OK %>% - bind_rows(.data_FIXED) %>% - select(-col_formula) %>% - spread(sample, abundance) %>% - as_matrix(rownames = transcript) - - }) + # Imputation + .x = fill_NA_matrix_with_factor_colwise( + .x, + # I split according to the formula + colData(.data)[,parse_formula(.formula)] + ) - .data + # Exp back + if(need_log) .x = exp(.x)-1 + + # Scale back if pseudoscaled + if(!grepl("_scaled", .y)) .x = .x * library_size + + # Return + .x + } + ) %>% + + # Add imputed to the name + setNames(sprintf("%s%s", names(.assay_to_impute), suffix)) + + .assays_name_to_port = names(assays(.data)) %>% setdiff(names(.assay_to_impute)) + + assays(.data) = + as.list(assays(.data))[.assays_name_to_port] %>% + c(imputed_dataframe ) %>% + + # Add .imputed column + c(list(.imputed = which_NA_matrix(.assay_to_impute[[1]] ))) %>% + + # Make names unique + setNames(names(.) %>% make.unique()) + + + .data %>% + + # Reattach internals + reattach_internals(.data) } + + #' impute_missing_abundance #' @inheritParams impute_missing_abundance #' diff --git a/R/utilities.R b/R/utilities.R index 5c1f956d..4ce61b46 100755 --- a/R/utilities.R +++ b/R/utilities.R @@ -257,7 +257,7 @@ scale_design = function(df, .formula) { } get_tt_columns = function(.data){ - if(.data %>% attr("internals") %>% is.list()) + if(.data %>% attr("internals") %>% is.list() ) #& "internals" %in% (.data %>% attr("internals") %>% names())) .data %>% attr("internals") %$% tt_columns else NULL } @@ -1215,6 +1215,15 @@ add_scaled_counts_bulk.calcNormFactor <- function(.data, droplevels() %>% select(!!.sample, !!.transcript, !!.abundance) + df.filt.spread = + df.filt %>% + tidyr::spread(!!.sample,!!.abundance) %>% + tidyr::drop_na() %>% + dplyr::select(-!!.transcript) + + # If not enough genes, warning + if(nrow(df.filt.spread)<100) warning(warning_for_scaling_with_few_genes) + # scaled data set nf = tibble::tibble( @@ -1223,10 +1232,7 @@ add_scaled_counts_bulk.calcNormFactor <- function(.data, # scaled data frame nf = edgeR::calcNormFactors( - df.filt %>% - tidyr::spread(!!.sample,!!.abundance) %>% - tidyr::drop_na() %>% - dplyr::select(-!!.transcript), + df.filt.spread, refColumn = which(reference == factor(levels( df.filt %>% pull(!!.sample) ))), @@ -1658,3 +1664,53 @@ filter_genes_on_condition = function(.data, .subset_for_scaling){ .data[rownames(.data) %in% my_genes,] } + + +which_NA_matrix = function(.data){ + is_na <- which(is.na(.data), arr.ind=TRUE) + which_is_NA = fill_matrix_with_FALSE(.data ) + which_is_NA[is_na] <- TRUE + which_is_NA +} + +fill_matrix_with_FALSE = function(.data){ + .data[,] = FALSE + .data +} + +rowMedians = function(.data, na.rm){ + apply(.data, 1, median, na.rm=na.rm) +} + +fill_NA_matrix_with_factor_colwise = function(.data, factor){ + + rn = rownames(.data) + cn = colnames(.data) + + .data %>% + t %>% + split.data.frame(factor) %>% + map(~ t(.x)) %>% + + # Fill + map( + ~ { + k <- which(is.na(.x), arr.ind=TRUE) + .x[k] <- rowMedians(.x, na.rm=TRUE)[k[,1]] + .x + } + ) %>% + + # Add NA factors if any + when( + is.na(factor) %>% length() %>% gt(0) ~ (.) %>% c(list(.data[,is.na(factor)])), + ~ (.) + ) %>% + + # Merge + reduce(cbind) %>% + + # Reorder rows and column as it was + .[rn, cn] + +} diff --git a/man/fill_NA_using_formula.Rd b/man/fill_NA_using_formula.Rd index f44f40e2..4824695a 100644 --- a/man/fill_NA_using_formula.Rd +++ b/man/fill_NA_using_formula.Rd @@ -10,7 +10,8 @@ fill_NA_using_formula( .sample = NULL, .transcript = NULL, .abundance = NULL, - .abundance_scaled = NULL + .abundance_scaled = NUL, + suffix = "_imputed" ) } \arguments{ diff --git a/man/impute_missing_abundance-methods.Rd b/man/impute_missing_abundance-methods.Rd index fdb77e74..20ef0c04 100644 --- a/man/impute_missing_abundance-methods.Rd +++ b/man/impute_missing_abundance-methods.Rd @@ -15,7 +15,8 @@ impute_missing_abundance( .formula, .sample = NULL, .transcript = NULL, - .abundance = NULL + .abundance = NULL, + suffix = "" ) \S4method{impute_missing_abundance}{spec_tbl_df}( @@ -23,7 +24,8 @@ impute_missing_abundance( .formula, .sample = NULL, .transcript = NULL, - .abundance = NULL + .abundance = NULL, + suffix = "" ) \S4method{impute_missing_abundance}{tbl_df}( @@ -31,7 +33,8 @@ impute_missing_abundance( .formula, .sample = NULL, .transcript = NULL, - .abundance = NULL + .abundance = NULL, + suffix = "" ) \S4method{impute_missing_abundance}{tidybulk}( @@ -39,12 +42,27 @@ impute_missing_abundance( .formula, .sample = NULL, .transcript = NULL, - .abundance = NULL + .abundance = NULL, + suffix = "" ) -\S4method{impute_missing_abundance}{SummarizedExperiment}(.data, .formula) +\S4method{impute_missing_abundance}{SummarizedExperiment}( + .data, + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + suffix = "" +) -\S4method{impute_missing_abundance}{RangedSummarizedExperiment}(.data, .formula) +\S4method{impute_missing_abundance}{RangedSummarizedExperiment}( + .data, + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + suffix = "" +) } \arguments{ \item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))}