Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sparse counts #211

Merged
merged 25 commits into from
Sep 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
3e398e1
improve error for attributes missing
stemangiola Aug 26, 2021
a0b23b4
update imputation and warning for scaling
stemangiola Aug 26, 2021
b0dc5c6
improve error for attributes missing
stemangiola Aug 26, 2021
3a72440
update imputation and warning for scaling
stemangiola Aug 26, 2021
a769d48
Merge branch 'sparse-counts' of github.com:stemangiola/tidybulk into …
stemangiola Aug 27, 2021
5f77beb
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
ab1139a
improve error for attributes missing
stemangiola Aug 26, 2021
240dc41
update imputation and warning for scaling
stemangiola Aug 26, 2021
2b1fbac
improve error for attributes missing
stemangiola Aug 26, 2021
7923805
update imputation and warning for scaling
stemangiola Aug 26, 2021
3432e98
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
6184999
weird commit I have to do
stemangiola Aug 31, 2021
905e822
fix rebase
stemangiola Sep 17, 2021
3c9860e
improve error for attributes missing
stemangiola Aug 26, 2021
dcf1199
update imputation and warning for scaling
stemangiola Aug 26, 2021
064f6bb
improve error for attributes missing
stemangiola Aug 26, 2021
12bf4c5
update imputation and warning for scaling
stemangiola Aug 26, 2021
0f63cfa
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
9e15367
improve error for attributes missing
stemangiola Aug 26, 2021
ea040a2
update imputation and warning for scaling
stemangiola Aug 26, 2021
08881a9
improve error for attributes missing
stemangiola Aug 26, 2021
bd65b33
update imputation and warning for scaling
stemangiola Aug 26, 2021
0600647
improved imputation for SE, tibble still to do
stemangiola Aug 28, 2021
6a7ceb8
weird commit I have to do
stemangiola Aug 31, 2021
c275c8e
dummy
stemangiola Sep 17, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions R/dictionary.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@

scaled_string = "_scaled"
adjusted_string = "_adjusted"

warning_for_scaling_with_few_genes = "tidybulk says: There are < 100 features/genes that are present in all your samples. Because edgeR::calcNormFactors does not allow NAs, the scaling is performed on that limited set of features/genes. The scaling might not be accurate; it is advisable to perform impute_missing_abundance() before scaling. It is possible to filter the imputed counts after scaling."
50 changes: 42 additions & 8 deletions R/functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -3174,7 +3174,8 @@ fill_NA_using_formula = function(.data,
.sample = NULL,
.transcript = NULL,
.abundance = NULL,
.abundance_scaled = NULL){
.abundance_scaled = NULL,
suffix = "_imputed"){

# Get column names
.sample = enquo(.sample)
Expand All @@ -3194,7 +3195,6 @@ fill_NA_using_formula = function(.data,


# Create NAs for missing sample/transcript pair

.data_completed =
.data %>%

Expand All @@ -3203,9 +3203,29 @@ fill_NA_using_formula = function(.data,
mutate(ct_data = map(ct_data, ~ .x %>% droplevels() %>% complete(!!as.symbol(quo_name(.sample)), !!.transcript) )) %>%
unnest(ct_data)

# For non scaled counts create a pseudo scale based on library size, then calculate imputed and scale back
abundance_is_int = .data %>% slice(1) %>% pull(!!.abundance) %>% class() %>% equals("integer")
.data =
.data %>%
group_by(!!.sample) %>%
mutate(library_size__ = sum(!!.abundance)) %>%
ungroup() %>%
mutate(!!.abundance := !!.abundance / library_size__)

imputed_column = sprintf("%s%s", quo_name(.abundance), suffix )
imputed_column_scaled = sprintf("%s%s", quo_name(.abundance_scaled), suffix )

# Divide the dataset
.data_OK =
.data %>%
anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula))
anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) %>%

# Add the imputed column
mutate(!!as.symbol(imputed_column) := !!.abundance) %>%
when(
quo_is_symbol(.abundance_scaled) ~ .x %>%
mutate(!!as.symbol(imputed_column_scaled) := !!.abundance_scaled)
)

.data_FIXED =
.data %>%
Expand All @@ -3216,16 +3236,22 @@ fill_NA_using_formula = function(.data,
.data_completed %>%
filter(!!.abundance %>% is.na) %>%
select(!!.sample, !!.transcript) %>%
left_join(.data %>% pivot_sample(!!.sample)) %>%
left_join(.data %>% pivot_transcript(!!.transcript))
) %>%
left_join(.data %>% pivot_sample(!!.sample), by = quo_name(.sample)) %>%
left_join(.data %>% pivot_transcript(!!.transcript), by = quo_name(.transcript))
)

# Clean environment
rm(.data_completed)
gc()

.data_FIXED %>%

# Group by covariate
nest(cov_data = -c(col_formula, !!.transcript)) %>%
mutate(cov_data = map(cov_data, ~
.x %>%
mutate(
!!.abundance := ifelse(
!!as.symbol(imputed_column) := ifelse(
!!.abundance %>% is.na,
median(!!.abundance, na.rm = TRUE),!!.abundance
)
Expand All @@ -3235,7 +3261,7 @@ fill_NA_using_formula = function(.data,
ifelse_pipe(
quo_is_symbol(.abundance_scaled),
~ .x %>% mutate(
!!.abundance_scaled := ifelse(
!!as.symbol(imputed_column_scaled) := ifelse(
!!.abundance_scaled %>% is.na,
median(!!.abundance_scaled, na.rm = TRUE),!!.abundance_scaled
)
Expand All @@ -3250,6 +3276,11 @@ fill_NA_using_formula = function(.data,
.data_OK %>%
bind_rows(.data_FIXED) %>%

# Scale back the pseudoscaling
mutate(!!.abundance := !!.abundance * library_size__) %>%
select(-library_size__) %>%
when(abundance_is_int ~ mutate(., !!.abundance := as.integer(!!.abundance)), ~ (.)) %>%

# Reattach internals
reattach_internals(.data)

Expand Down Expand Up @@ -3290,6 +3321,9 @@ fill_NA_using_value = function(.data,
.feature = enquo(.transcript)
.value = enquo(.abundance)

# Scale based on library size


# Create NAs for missing element/feature pair
df_to_impute =
.data %>%
Expand Down
9 changes: 6 additions & 3 deletions R/methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -3685,15 +3685,17 @@ setGeneric("impute_missing_abundance", function(.data,
.formula,
.sample = NULL,
.transcript = NULL,
.abundance = NULL)
.abundance = NULL,
suffix = "")
standardGeneric("impute_missing_abundance"))

# Set internal
.impute_missing_abundance = function(.data,
.formula,
.sample = NULL,
.transcript = NULL,
.abundance = NULL)
.abundance = NULL,
suffix = "")
{
# Get column names
.sample = enquo(.sample)
Expand Down Expand Up @@ -3724,7 +3726,8 @@ setGeneric("impute_missing_abundance", function(.data,
.sample = !!.sample,
.transcript = !!.transcript,
.abundance = !!.abundance,
.abundance_scaled = !!.abundance_scaled) %>%
.abundance_scaled = !!.abundance_scaled,
suffix = suffix) %>%

# Reattach internals
reattach_internals(.data)
Expand Down
149 changes: 74 additions & 75 deletions R/methods_SE.R
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,13 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se)
when(nrow(.) == 0 ~ stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data."), ~ (.))

my_assay = assays(.data_filtered) %>% as.list() %>% .[1]
my_counts_filtered = my_assay[[1]]
library_size_filtered = my_counts_filtered %>% colSums()

# Drop genes with NAs, as edgeR::calcNormFactors does not accept them
my_counts_filtered = my_assay[[1]] %>% na.omit()
library_size_filtered = my_counts_filtered %>% colSums(na.rm = TRUE)

# If not enough genes, warning
if(nrow(my_counts_filtered)<100) warning(warning_for_scaling_with_few_genes)

# Set column name for value scaled
value_scaled = my_assay %>% names() %>% paste0(scaled_string)
Expand Down Expand Up @@ -1315,7 +1320,8 @@ setMethod("keep_variable",
edgeR::filterByExpr(
min.count = minimum_counts,
group = string_factor_of_interest,
min.prop = minimum_proportion
min.prop = minimum_proportion,
lib.size = colSums(., na.rm=TRUE)
) %>%
not() %>%
which %>%
Expand Down Expand Up @@ -1956,89 +1962,82 @@ setMethod("pivot_transcript",


.impute_missing_abundance_se = function(.data,
.formula) {
.formula,
.sample = NULL,
.transcript = NULL,
.abundance = NULL,
suffix = "") {

.abundance = enquo(.abundance)

.assay_to_impute =
.abundance %>%
when(
quo_is_symbolic(.) ~ assays(.data)[quo_names(.abundance)],
~ assays(.data)
)



# Split data by formula and impute
imputed_dataframe =
map2(

# Capture assay names as we need to know if scaled is in the name
as.list(.assay_to_impute), names(.assay_to_impute),
~ {

col_formula =
colData(.data) %>%
as_tibble() %>%
select(parse_formula(.formula)) %>%
distinct() %>%
select_if(function(x) is.character(x) | is.logical(x) | is.factor(x)) %>%
colnames
# Pseudo-scale if not scaled
if(!grepl("_scaled", .y)) library_size = colSums(.x, na.rm = TRUE)
if(!grepl("_scaled", .y)) .x = .x / library_size

# Create NAs for missing sample/transcript pair
assays(.data) =
assays(.data) %>%
as.list() %>%
map(~{
.my_data =
.x %>%
as.matrix() %>%
as_tibble(rownames = "transcript") %>%
gather(sample, abundance, -transcript) %>%

# Attach annotation
left_join(
colData(.data) %>%
as_tibble(rownames="sample") %>%
select(sample, col_formula),
by="sample"
)
# Log
need_log = max(.x, na.rm=T) > 50
if(need_log) .x = log1p(.x)

# Data used for filtering
NA_data =
.my_data %>%
filter(abundance %>% is.na) %>%
select( transcript, col_formula) %>%
distinct()

# If no missing just return the same matrix
if(nrow(NA_data) == 0) return(.x)

.data_OK =
.my_data %>%
anti_join(NA_data, by = c("transcript", col_formula))


.data_FIXED =
.my_data %>%
inner_join(NA_data, by = c("transcript", col_formula)) %>%

# Group by covariate
nest(cov_data = -c(col_formula, transcript)) %>%
mutate(cov_data = map(cov_data, ~
.x %>%
mutate(abundance =
case_when(
is.na(abundance) ~ median(abundance, na.rm=TRUE),
TRUE ~ abundance
)
) %>%

# Throw warning if group of size 1
when(
nrow(.) %>% st(2) ~ warning("tidybulk says: According to your design matrix, you have sample groups of size < 2, so your dataset could still be sparse."),
~ (.)
)
)) %>%
unnest(cov_data)

.data_OK %>%
bind_rows(.data_FIXED) %>%
select(-col_formula) %>%
spread(sample, abundance) %>%
as_matrix(rownames = transcript)

})
# Imputation
.x = fill_NA_matrix_with_factor_colwise(
.x,
# I split according to the formula
colData(.data)[,parse_formula(.formula)]
)

.data
# Exp back
if(need_log) .x = exp(.x)-1

# Scale back if pseudoscaled
if(!grepl("_scaled", .y)) .x = .x * library_size

# Return
.x
}
) %>%

# Add imputed to the name
setNames(sprintf("%s%s", names(.assay_to_impute), suffix))

.assays_name_to_port = names(assays(.data)) %>% setdiff(names(.assay_to_impute))

assays(.data) =
as.list(assays(.data))[.assays_name_to_port] %>%
c(imputed_dataframe ) %>%

# Add .imputed column
c(list(.imputed = which_NA_matrix(.assay_to_impute[[1]] ))) %>%

# Make names unique
setNames(names(.) %>% make.unique())


.data %>%

# Reattach internals
reattach_internals(.data)

}



#' impute_missing_abundance
#' @inheritParams impute_missing_abundance
#'
Expand Down
Loading