diff --git a/.Rinstignore b/.Rinstignore new file mode 100644 index 00000000..caf01ec9 --- /dev/null +++ b/.Rinstignore @@ -0,0 +1 @@ +dev/ \ No newline at end of file diff --git a/.coveralls.yml b/.coveralls.yml deleted file mode 100755 index a791cc00..00000000 --- a/.coveralls.yml +++ /dev/null @@ -1,2 +0,0 @@ -service_name: travis-pro -repo_token: O4NscPehU4qrWznFtQRiyJJBIOyRgPzsB diff --git a/.gitignore b/.gitignore old mode 100755 new mode 100644 diff --git a/.travis.yml b/.travis.yml deleted file mode 100755 index 7e9e7d23..00000000 --- a/.travis.yml +++ /dev/null @@ -1,16 +0,0 @@ -# Adapted from https://github.com/hadley/testthat/blob/master/.travis.yml -# R for travis: see documentation at https://docs.travis-ci.com/user/languages/r -language: r -cache: packages -r: - - bioc-release - - bioc-devel -env: -- R_QPDF=true - -r_github_packages: - - r-lib/covr - -after_success: - - tar -C .. -xf $PKG_TARBALL - - xvfb-run Rscript -e 'covr::codecov(type=c("tests", "vignettes", "examples"))' diff --git a/DESCRIPTION b/DESCRIPTION index bad82c8f..8726b357 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -36,57 +36,56 @@ Imports: methods, S4Vectors, crayon, - Matrix + Matrix, + pkgconfig, + AnnotationDbi, + DESeq2, + Rtsne, + Seurat, + betareg, + boot, + broom, + class, + clusterProfiler, + e1071, + edgeR, + functional, + glmmSeq, + glmmTMB, + limma, + lme4, + matrixStats, + msigdbr, + org.Hs.eg.db, + org.Mm.eg.db, + pbapply, + pbmcapply, + survival, + survminer, + sva, + uwot, + widyr Suggests: + tidySummarizedExperiment, + tidyHeatmap, BiocStyle, testthat, vctrs, - AnnotationDbi, BiocManager, Rsubread, - e1071, - edgeR, - limma, - org.Hs.eg.db, - org.Mm.eg.db, - sva, GGally, knitr, qpdf, covr, - Seurat, KernSmooth, - Rtsne, ggplot2, - widyr, - clusterProfiler, - msigdbr, - DESeq2, - broom, - survival, - boot, - betareg, - tidyHeatmap, pasilla, ggrepel, - devtools, - functional, - survminer, - tidySummarizedExperiment, markdown, - uwot, - matrixStats, igraph, - EGSEA, 
IRanges, here, - glmmSeq, - pbapply, - pbmcapply, - lme4, - glmmTMB, - MASS, - pkgconfig + MASS VignetteBuilder: knitr RdMacros: diff --git a/NAMESPACE b/NAMESPACE old mode 100755 new mode 100644 index 17b09661..304bc3b8 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,37 +16,29 @@ S3method(rename,tidybulk) S3method(right_join,tidybulk) S3method(rowwise,tidybulk) S3method(summarise,tidybulk) +S3method(summarize,tidybulk) S3method(ungroup,tidybulk) S3method(unnest,nested_tidybulk) +S3method(validation,tidybulk) export("%>%") export(adjust_abundance) export(aggregate_duplicates) -export(arrange) export(as_SummarizedExperiment) export(as_matrix) export(as_tibble) -export(bind_cols) export(cluster_elements) export(deconvolve_cellularity) export(describe_transcript) -export(distinct) export(do) export(ensembl_to_symbol) export(fill_missing_abundance) -export(filter) -export(full_join) export(get_bibliography) -export(group_by) export(identify_abundant) export(impute_missing_abundance) -export(inner_join) export(keep_abundant) export(keep_variable) -export(left_join) export(log10_reverse_trans) export(logit_trans) -export(mutate) -export(nest) export(pivot_sample) export(pivot_transcript) export(quantile_normalise_abundance) @@ -56,10 +48,8 @@ export(rename) export(resolve_complete_confounders_of_non_interest) export(right_join) export(rotate_dimensions) -export(rowwise) export(scale_abundance) export(select) -export(summarise) export(symbol_to_entrez) export(test_differential_abundance) export(test_differential_cellularity) @@ -70,14 +60,16 @@ export(test_stratification_cellularity) export(tibble) export(tidybulk) export(tidybulk_SAM_BAM) -export(unnest) +export(validation_default) exportMethods(as_SummarizedExperiment) exportMethods(quantile_normalise_abundance) exportMethods(scale_abundance) exportMethods(tidybulk) exportMethods(tidybulk_SAM_BAM) import(SummarizedExperiment) +import(e1071) import(lifecycle) +import(lme4) import(methods) import(parallel) import(preprocessCore) 
@@ -91,6 +83,9 @@ importFrom(SummarizedExperiment,assays) importFrom(SummarizedExperiment,colData) importFrom(SummarizedExperiment,rowData) importFrom(SummarizedExperiment,rowRanges) +importFrom(betareg,betareg) +importFrom(boot,logit) +importFrom(broom,tidy) importFrom(dplyr,across) importFrom(dplyr,anti_join) importFrom(dplyr,arrange) @@ -123,7 +118,10 @@ importFrom(dplyr,slice) importFrom(dplyr,starts_with) importFrom(dplyr,summarise) importFrom(dplyr,summarise_all) +importFrom(dplyr,summarize) importFrom(dplyr,ungroup) +importFrom(functional,Compose) +importFrom(glmmTMB,glmmTMBControl) importFrom(lifecycle,deprecate_soft) importFrom(lifecycle,deprecate_warn) importFrom(magrittr,"%$%") @@ -134,7 +132,12 @@ importFrom(magrittr,extract2) importFrom(magrittr,multiply_by) importFrom(magrittr,set_colnames) importFrom(magrittr,set_rownames) +importFrom(matrixStats,colSds) +importFrom(parallel,clusterExport) +importFrom(parallel,makeCluster) importFrom(parallel,mclapply) +importFrom(pbapply,pblapply) +importFrom(pbmcapply,pbmclapply) importFrom(purrr,as_mapper) importFrom(purrr,map) importFrom(purrr,map2) @@ -151,7 +154,6 @@ importFrom(rlang,":=") importFrom(rlang,dots_list) importFrom(rlang,dots_values) importFrom(rlang,enquo) -importFrom(rlang,enquos) importFrom(rlang,flatten_if) importFrom(rlang,inform) importFrom(rlang,is_spliced) @@ -178,9 +180,7 @@ importFrom(stats,model.matrix) importFrom(stats,na.omit) importFrom(stats,p.adjust) importFrom(stats,pchisq) -importFrom(stats,plogis) importFrom(stats,prcomp) -importFrom(stats,qlogis) importFrom(stats,rnorm) importFrom(stats,sd) importFrom(stats,setNames) @@ -192,6 +192,10 @@ importFrom(stringr,str_remove) importFrom(stringr,str_replace) importFrom(stringr,str_replace_all) importFrom(stringr,str_split) +importFrom(survival,coxph) +importFrom(survival,survdiff) +importFrom(survminer,ggsurvplot) +importFrom(survminer,surv_fit) importFrom(tibble,as_tibble) importFrom(tibble,enframe) importFrom(tibble,rowid_to_column) 
diff --git a/R/attach.R b/R/attach.R new file mode 100644 index 00000000..8e58b773 --- /dev/null +++ b/R/attach.R @@ -0,0 +1,22 @@ +core <- c("dplyr", "tidyr", "ttservice") + +core_unloaded <- function() { + search <- paste0("package:", core) + core[!search %in% search()] +} + + +same_library <- function(pkg) { + loc <- if (pkg %in% loadedNamespaces()) + dirname(getNamespaceInfo(pkg, "path")) + library(pkg, lib.loc=loc, character.only=TRUE, warn.conflicts=FALSE) +} + +tidyverse_attach <- function() { + to_load <- core_unloaded() + + suppressPackageStartupMessages( + lapply(to_load, same_library)) + + invisible(to_load) +} diff --git a/R/cibersort.R b/R/cibersort.R index 4694c04a..b396d5a9 100755 --- a/R/cibersort.R +++ b/R/cibersort.R @@ -202,8 +202,11 @@ call_core = function(itor, Y, X, P, pval, CoreAlg){ } +#' @import e1071 +#' @import parallel +#' @import preprocessCore #' @importFrom stats sd -#' @importFrom utils install.packages +#' @importFrom matrixStats colSds #' #' @keywords internal #' @@ -255,12 +258,6 @@ my_CIBERSORT <- function(Y, X, perm=0, QN=TRUE, cores = 3, exp_transform = FALSE )) Y=Y[,colSums(Y)>0, drop=FALSE] - # Check if package is installed, otherwise install - if (find.package("matrixStats", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing matrixStats needed for cibersort") - install.packages("matrixStats", repos = "https://cloud.r-project.org") - } - # Eliminate sd == 0 if(length(which(matrixStats::colSds(Y)==0))>0) warning(sprintf( diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R index 0f43685b..d84ca57c 100755 --- a/R/dplyr_methods.R +++ b/R/dplyr_methods.R @@ -1,70 +1,13 @@ - -#' @export -dplyr::select - -#' Arrange rows by column values -#' -#' -#' @description -#' `arrange()` order the rows of a data frame rows by the values of selected -#' columns. 
-#' -#' Unlike other dplyr verbs, `arrange()` largely ignores grouping; you -#' need to explicit mention grouping variables (or use `by_group = TRUE`) -#' in order to group by them, and functions of variables are evaluated -#' once per data frame, not once per group. -#' -#' @details -#' ## Locales -#' The sort order for character vectors will depend on the collating sequence -#' of the locale in use: see [locales()]. -#' -#' ## Missing values -#' Unlike base sorting with `sort()`, `NA` are: -#' * always sorted to the end for local data, even when wrapped with `desc()`. -#' * treated differently for remote data, depending on the backend. -#' -#' @return -#' An object of the same type as `.data`. -#' -#' * All rows appear in the output, but (usually) in a different place. -#' * Columns are not modified. -#' * Groups are not modified. -#' * Data frame attributes are preserved. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' -#' @param .data A data frame, data frame extension (e.g. a tibble), or a -#' lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for -#' more details. -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions or -#' variables. Use [desc()] to sort a variable in descending order. -#' @param .by_group If TRUE, will sort first by grouping variable. Applies to grouped data frames only. 
-#' -#' @return A tibble -#' @family single table verbs -#' -#' @rdname arrange-methods #' @name arrange +#' @rdname arrange +#' @inherit dplyr::arrange +#' @family single table verbs #' @importFrom dplyr arrange -#' -#' @examples -#' -#' arrange(mtcars, cyl, disp) -#' -#' @export -NULL - #' @export arrange.tidybulk <- function(.data, ..., .by_group = FALSE) { - - .data |> - drop_class(c("tidybulk", "tt")) |> - dplyr::arrange( ..., .by_group = .by_group) |> + .data |> + drop_class(c("tidybulk", "tt")) |> + dplyr::arrange( ..., .by_group = .by_group) |> # Attach attributes reattach_internals(.data) |> @@ -72,130 +15,68 @@ arrange.tidybulk <- function(.data, ..., .by_group = FALSE) { # Add class add_class("tt") |> add_class("tidybulk") - } -#' Efficiently bind multiple data frames by row and column -#' -#' This is an efficient implementation of the common pattern of -#' `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many -#' data frames into one. -#' -#' The output of `bind_rows()` will contain a column if that column -#' appears in any of the inputs. -#' -#' @param ... Data frames to combine. -#' -#' Each argument can either be a data frame, a list that could be a data -#' frame, or a list of data frames. -#' -#' When row-binding, columns are matched by name, and any missing -#' columns will be filled with NA. -#' -#' When column-binding, rows are matched by position, so all data -#' frames must have the same number of rows. To match by value, not -#' position, see mutate-joins. -#' @param .id Data frame identifier. -#' -#' When `.id` is supplied, a new column of identifiers is -#' created to link each row to its original data frame. The labels -#' are taken from the named arguments to `bind_rows()`. When a -#' list of data frames is supplied, the labels are taken from the -#' names of the list. If no names are found a numeric sequence is -#' used instead. -#' @param add.cell.ids from Seurat 3.0 A character vector of length(x = c(x, y)). 
Appends the corresponding values to the start of each objects' cell names. -#' -#' @importFrom ttservice bind_rows +#' @name bind_rows +#' @rdname bind_rows +#' @inherit ttservice::bind_rows #' -#' @return `bind_rows()` and `bind_cols()` return the same type as -#' the first input, either a data frame, `tbl_df`, or `grouped_df`. #' @examples #' data(se_mini) #' -#' se_mini_tidybulk = se_mini |> tidybulk() -#' bind_rows( se_mini_tidybulk, se_mini_tidybulk ) +#' se_mini_tidybulk <- se_mini |> tidybulk() +#' bind_rows(se_mini_tidybulk, se_mini_tidybulk) #' -#' tt_bind = se_mini_tidybulk |> select(time, condition) +#' tt_bind <- se_mini_tidybulk |> select(time, condition) #' se_mini_tidybulk |> bind_cols(tt_bind) #' -#' @name bind_rows -NULL - #' @importFrom rlang dots_values #' @importFrom rlang flatten_if #' @importFrom rlang is_spliced -#' +#' @importFrom ttservice bind_rows #' @export -#' -bind_rows.tidybulk <- function(..., .id = NULL) -{ +bind_rows.tidybulk <- function(..., .id = NULL) { + tts <- flatten_if(dots_values(...), is_spliced) - tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor dplyr:::flatten_bindable(rlang::dots_values(...)) - - par1 = tts[[1]] |> get_tt_columns() |> unlist() - par2 = tts[[2]] |> get_tt_columns() |> unlist() + par1 <- tts[[1]] |> get_tt_columns() |> unlist() + par2 <- tts[[2]] |> get_tt_columns() |> unlist() # # tt_columns of the two objects must match # error_if_parameters_not_match(par1, par2) ttservice:::bind_rows.data.frame(..., .id = .id) |> - # Attach attributes reattach_internals(tts[[1]]) - } -#' @export -#' -#' @importFrom ttservice bind_cols -#' @inheritParams bind_cols -#' -#' @rdname dplyr-methods -#' @name bind_cols -NULL - #' @importFrom rlang dots_values #' @importFrom rlang flatten_if #' @importFrom rlang is_spliced -#' -#' @export -#' -bind_cols.tidybulk <- function(..., .id = NULL) -{ - - tts = tts = flatten_if(dots_values(...), is_spliced) # Original that fails Bioconductor 
dplyr:::flatten_bindable(rlang::dots_values(...)) - - dplyr::bind_cols(..., .id = .id) |> - - # Attach attributes - reattach_internals(tts[[1]]) - +#' @importFrom ttservice bind_cols +bind_cols_ <- function(..., .id = NULL) { + tts <- tts <- flatten_if(dots_values(...), is_spliced) + + ttservice::bind_cols(..., .id = .id) |> + # Attach attributes + reattach_internals(tts[[1]]) } -#' distinct -#' @param .data A tbl. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' @param .keep_all If TRUE, keep all variables in .data. If a combination of ... is not distinct, this keeps the first row of values. (See dplyr) -#' -#' @return A tt object -#' -#' @rdname distinct-methods +#' @rdname bind_rows +#' @aliases bind_cols +#' @export +bind_cols.tidybulk <- bind_cols_ + #' @name distinct -#' @importFrom dplyr distinct +#' @rdname distinct +#' @inherit dplyr::distinct #' #' @examples -#' -#' tidybulk::se_mini |> tidybulk() |> distinct() -#' -#' -#' @export -NULL - - -#' @inheritParams distinct +#' data(se_mini) +#' se_mini |> tidybulk() |> distinct() +#' +#' @importFrom dplyr distinct #' @export -distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) -{ +distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::distinct(..., .keep_all = .keep_all) |> @@ -206,84 +87,23 @@ distinct.tidybulk <- function (.data, ..., .keep_all = FALSE) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Subset rows using column values -#' -#' `filter()` retains the rows where the conditions you provide a `TRUE`. Note -#' that, unlike base subsetting with `[`, rows where the condition evaluates -#' to `NA` are dropped. -#' -#' dplyr is not yet smart enough to optimise filtering optimisation -#' on grouped datasets that don't need grouped calculations. For this reason, -#' filtering is often considerably faster on [ungroup()]ed data. 
-#' -#' @section Useful filter functions: -#' -#' * [`==`], [`>`], [`>=`] etc -#' * [`&`], [`|`], [`!`], [xor()] -#' * [is.na()] -#' * [between()], [near()] -#' -#' @section Grouped tibbles: -#' -#' Because filtering expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. Compare this ungrouped filtering: -#' -#' -#' The former keeps rows with `mass` greater than the global average -#' whereas the latter keeps rows with `mass` greater than the gender -#' -#' average. -#' @family single table verbs -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Logical predicates defined in -#' terms of the variables in `.data`. -#' Multiple conditions are combined with `&`. Only rows where the -#' condition evaluates to `TRUE` are kept. -#' @param .preserve when `FALSE` (the default), the grouping structure -#' is recalculated based on the resulting data, otherwise it is kept as is. -#' @return -#' An object of the same type as `.data`. -#' -#' * Rows are a subset of the input, but appear in the same order. -#' * Columns are not modified. -#' * The number of groups may be reduced (if `.preserve` is not `TRUE`). -#' * Data frame attributes are preserved. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' @seealso [filter_all()], [filter_if()] and [filter_at()]. 
-#' -#' @rdname filter-methods #' @name filter -#' -#' @importFrom dplyr filter -#' -#' @export +#' @rdname filter +#' @inherit dplyr::filter #' #' @examples -#' #' data(se) -#' #' se |> tidybulk() |> filter(dex=="untrt") -#' #' # Learn more in ?dplyr_tidy_eval -NULL - -#' @inheritParams filter +#' +#' @importFrom dplyr filter #' @export -filter.tidybulk <- function (.data, ..., .preserve = FALSE) -{ +filter.tidybulk <- function (.data, ..., .preserve = FALSE) { .data |> drop_class(c("tidybulk", "tt")) |> - dplyr::filter( ..., .preserve = .preserve) |> + dplyr::filter(..., .preserve = .preserve) |> # Attach attributes reattach_internals(.data) |> @@ -291,55 +111,16 @@ filter.tidybulk <- function (.data, ..., .preserve = FALSE) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Group by one or more variables -#' -#' @description -#' Most data operations are done on groups defined by variables. -#' `group_by()` takes an existing tbl and converts it into a grouped tbl -#' where operations are performed "by group". `ungroup()` removes grouping. -#' -#' @family grouping functions -#' @param .data A tbl. (See dplyr) -#' @param ... In `group_by()`, variables or computations to group by. -#' In `ungroup()`, variables to remove from the grouping. -#' @param .add When `FALSE`, the default, `group_by()` will -#' override existing groups. To add to the existing groups, use -#' `.add = TRUE`. -#' -#' This argument was previously called `add`, but that prevented -#' creating a new grouping variable called `add`, and conflicts with -#' our naming conventions. -#' @param .drop When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for -#' what the default value is for this argument. -#' @return A [grouped data frame][grouped_df()], unless the combination of `...` and `add` -#' yields a non empty set of grouping columns, a regular (ungrouped) data frame -#' otherwise. 
-#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' Methods available in currently loaded packages: -#' -#' @rdname group_by-methods #' @name group_by -#' @importFrom dplyr group_by -#' -#' @export -#' -#' @examples -#' -#' by_cyl <- mtcars |> group_by(cyl) -#' -NULL - +#' @rdname group_by +#' @inherit dplyr::group_by #' @importFrom dplyr group_by_drop_default +#' @importFrom dplyr group_by #' @export -group_by.tidybulk <- function (.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) -{ +group_by.tidybulk <- function (.data, ..., .add = FALSE, + .drop = group_by_drop_default(.data)) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::group_by( ..., .drop = .drop) |> @@ -350,20 +131,14 @@ group_by.tidybulk <- function (.data, ..., .add = FALSE, .drop = group_by_drop_d # Add class add_class("tt") |> add_class("tidybulk") - } - -#' @rdname ungroup-methods #' @name ungroup +#' @rdname ungroup +#' @inherit dplyr::ungroup #' @importFrom dplyr ungroup -#' -#' @param x A [tbl()] -#' @param ... See dplyr -#' #' @export -ungroup.tidybulk <- function (x, ...) -{ +ungroup.tidybulk <- function (x, ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::ungroup( ...) |> @@ -374,89 +149,15 @@ ungroup.tidybulk <- function (x, ...) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Summarise each group to fewer rows -#' -#' @description -#' `summarise()` creates a new data frame. It will have one (or more) rows for -#' each combination of grouping variables; if there are no grouping variables, -#' the output will have a single row summarising all observations in the input. -#' It will contain one column for each grouping variable and one column -#' for each of the summary statistics that you have specified. -#' -#' `summarise()` and `summarize()` are synonyms. 
-#' -#' @section Useful functions: -#' -#' * Center: [mean()], [median()] -#' * Spread: [sd()], [IQR()], [mad()] -#' * Range: [min()], [max()], [quantile()] -#' * Position: [first()], [last()], [nth()], -#' * Count: [n()], [n_distinct()] -#' * Logical: [any()], [all()] -#' -#' @section Backend variations: -#' -#' The data frame backend supports creating a variable and using it in the -#' same summary. This means that previously created summary variables can be -#' further transformed or combined within the summary, as in [mutate()]. -#' However, it also means that summary variables with the same names as previous -#' variables overwrite them, making those variables unavailable to later summary -#' variables. -#' -#' This behaviour may not be supported in other backends. To avoid unexpected -#' results, consider using new names for your summary variables, especially when -#' creating multiple summaries. -#' -#' @export -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs of summary -#' functions. The name will be the name of the variable in the result. -#' -#' The value can be: -#' -#' * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. -#' * A vector of length `n`, e.g. `quantile()`. -#' * A data frame, to add multiple columns from a single expression. -#' @family single table verbs -#' @return -#' An object _usually_ of the same type as `.data`. -#' -#' * The rows come from the underlying `group_keys()`. -#' * The columns are a combination of the grouping keys and the summary -#' expressions that you provide. -#' * If `x` is grouped by more than one variable, the output will be another -#' [grouped_df] with the right-most group removed. -#' * If `x` is grouped by one variable, or is not grouped, the output will -#' be a [tibble]. -#' * Data frame attributes are **not** preserved, because `summarise()` -#' fundamentally creates a new data frame. 
-#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' @examples -#' -#' # A summary applied to ungrouped tbl returns a single row -#' -#' mtcars |> -#' summarise(mean = mean(disp)) -#' -#' -#' @rdname summarise-methods #' @name summarise +#' @aliases summarize +#' @inherit dplyr::summarise +#' @family single table verbs #' @importFrom dplyr summarise #' @export -NULL - -#' @inheritParams summarise -#' @export -summarise.tidybulk <- function (.data, ...) -{ +summarise.tidybulk <- function (.data, ...) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::summarise( ...) |> @@ -467,102 +168,21 @@ summarise.tidybulk <- function (.data, ...) # Add class add_class("tt") |> add_class("tidybulk") - } -#' Create, modify, and delete columns -#' -#' `mutate()` adds new variables and preserves existing ones; -#' `transmute()` adds new variables and drops existing ones. -#' New variables overwrite existing variables of the same name. -#' Variables can be removed by setting their value to `NULL`. -#' -#' @section Useful mutate functions: -#' -#' * [`+`], [`-`], [log()], etc., for their usual mathematical meanings -#' -#' * [lead()], [lag()] -#' -#' * [dense_rank()], [min_rank()], [percent_rank()], [row_number()], -#' [cume_dist()], [ntile()] -#' -#' * [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] -#' -#' * [na_if()], [coalesce()] -#' -#' * [if_else()], [recode()], [case_when()] -#' -#' @section Grouped tibbles: -#' -#' Because mutating expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. 
Compare this ungrouped mutate: -#' -#' With the grouped equivalent: -#' -#' The former normalises `mass` by the global average whereas the -#' latter normalises by the averages within gender levels. -#' +#' @name summarise +#' @rdname summarise +#' @importFrom dplyr summarize #' @export -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs. -#' The name gives the name of the column in the output. -#' -#' The value can be: -#' -#' * A vector of length 1, which will be recycled to the correct length. -#' * A vector the same length as the current group (or the whole data frame -#' if ungrouped). -#' * `NULL`, to remove the column. -#' * A data frame or tibble, to create multiple columns in the output. -#' -#' @family single table verbs -#' @return -#' An object of the same type as `.data`. -#' -#' For `mutate()`: -#' -#' * Rows are not affected. -#' * Existing columns will be preserved unless explicitly modified. -#' * New columns will be added to the right of existing columns. -#' * Columns given value `NULL` will be removed -#' * Groups will be recomputed if a grouping variable is mutated. -#' * Data frame attributes are preserved. -#' -#' For `transmute()`: -#' -#' * Rows are not affected. -#' * Apart from grouping variables, existing columns will be remove unless -#' explicitly kept. -#' * Column order matches order of expressions. -#' * Groups will be recomputed if a grouping variable is mutated. -#' * Data frame attributes are preserved. -#' @section Methods: -#' These function are **generic**s, which means that packages can provide -#' implementations (methods) for other classes. See the documentation of -#' individual methods for extra arguments and differences in behaviour. 
-#' -#' Methods available in currently loaded packages: -#' -#' @examples -#' -#' # Newly created variables are available immediately -#' mtcars |> as_tibble() |> mutate( -#' cyl2 = cyl * 2, -#' cyl4 = cyl2 * 2 -#' ) -#' -#' @rdname mutate-methods +summarize.tidybulk <- summarise.tidybulk + #' @name mutate +#' @rdname mutate +#' @inherit dplyr::mutate +#' @family single table verbs #' @importFrom dplyr mutate #' @export -NULL - -#' @inheritParams mutate -#' @export -mutate.tidybulk <- function(.data, ...) -{ +mutate.tidybulk <- function(.data, ...) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::mutate(...) |> @@ -573,14 +193,12 @@ mutate.tidybulk <- function(.data, ...) # Add class add_class("tt") |> add_class("tidybulk") - - } -#' @inheritParams mutate +#' @inherit dplyr::mutate +#' @importFrom dplyr mutate #' @export -mutate.nested_tidybulk <- function(.data, ...) -{ +mutate.nested_tidybulk <- function(.data, ...) { .data |> drop_class(c("nested_tidybulk", "tt")) |> dplyr::mutate(...) |> @@ -591,52 +209,15 @@ mutate.nested_tidybulk <- function(.data, ...) # Add class add_class("tt") |> add_class("nested_tidybulk") - - } -#' Rename columns -#' -#' Rename individual variables using `new_name = old_name` syntax. -#' -#' @section Scoped selection and renaming: -#' -#' Use the three scoped variants ([rename_all()], [rename_if()], [rename_at()]) -#' to renaming a set of variables with a function. -#' -#' @param .data A tbl. (See dplyr) -#' @param ... <[`tidy-select`][dplyr_tidy_select]> Use `new_name = old_name` -#' to rename selected variables. -#' @return -#' An object of the same type as `.data`. -#' * Rows are not affected. -#' * Column names are changed; column order is preserved -#' * Data frame attributes are preserved. -#' * Groups are updated to reflect new names. -#' @section Methods: -#' This function is a **generic**, which means that packages can provide -#' implementations (methods) for other classes. 
See the documentation of -#' individual methods for extra arguments and differences in behaviour. -#' -#' The following methods are currently available in loaded packages: -#' @family single table verbs -#' @export -#' -#' @examples -#' -#' iris <- as_tibble(iris) # so it prints a little nicer -#' rename(iris, petal_length = Petal.Length) -#' -#' @rdname rename-methods #' @name rename +#' @rdname rename +#' @inherit dplyr::rename +#' @family single table verbs #' @importFrom dplyr rename #' @export -NULL - -#' @inheritParams rename -#' @export -rename.tidybulk <- function(.data, ...) -{ +rename.tidybulk <- function(.data, ...) { .data |> drop_class(c("tidybulk", "tt")) |> dplyr::rename(...) |> @@ -647,49 +228,14 @@ rename.tidybulk <- function(.data, ...) # Add class add_class("tt") |> add_class("tidybulk") - - } -#' Group input by rows -#' -#' -#' See [this repository](https://github.com/jennybc/row-oriented-workflows) -#' for alternative ways to perform row-wise operations. -#' -#' `rowwise()` is used for the results of [do()] when you -#' create list-variables. It is also useful to support arbitrary -#' complex operations that need to be applied to each row. -#' -#' Currently, rowwise grouping only works with data frames. Its -#' main impact is to allow you to work with list-variables in -#' [summarise()] and [mutate()] without having to -#' use \code{[[1]]}. This makes `summarise()` on a rowwise tbl -#' effectively equivalent to [plyr::ldply()]. -#' -#' @param data Input data frame. -#' @param ... Variables to be preserved when calling summarise(). This is typically a set of variables whose combination uniquely identify each row. NB: unlike group_by() you can not create new variables here but instead you can select multiple variables with (e.g.) everything(). 
-#' -#' @return A consistent object (to the input) -#' -#' A `tbl` -#' -#' @export -#' @examples -#' -#' df <- expand.grid(x = 1:3, y = 3:1) -#' df_done <- df |> rowwise() -#' -#' @rdname rowwise-methods #' @name rowwise +#' @rdname rowwise +#' @inherit dplyr::rowwise #' @importFrom dplyr rowwise #' @export -NULL - -#' @inheritParams rowwise -#' @export -rowwise.tidybulk <- function(data, ...) -{ +rowwise.tidybulk <- function(data, ...) { data |> drop_class(c("tidybulk", "tt")) |> dplyr::rowwise() |> @@ -700,37 +246,22 @@ rowwise.tidybulk <- function(data, ...) # Add class add_class("tt") |> add_class("tidybulk") - - } -#' Left join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. (See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object -#' +#' @name left_join +#' @rdname left_join +#' @inherit dplyr::left_join +#' #' @examples +#' data(se_mini) +#' annotation <- se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) #' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) -#' -#' @rdname dplyr-methods -#' @name left_join #' @importFrom dplyr left_join #' @export -NULL - -#' @inheritParams left_join -#' @export -left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), - ...) 
-{ +left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::left_join(y, by = by, copy = copy, suffix = suffix, ...) |> @@ -741,35 +272,22 @@ left_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } -#' Inner join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. (See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object +#' @name inner_join +#' @rdname inner_join +#' @inherit dplyr::inner_join #' #' @examples -#' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) +#' data(se_mini) +#' annotation <- tidybulk::se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) #' -#' @rdname join-methods -#' @name inner_join #' @importFrom dplyr inner_join #' @export -NULL - -#' @inheritParams inner_join -#' @export -inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) -{ +inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::inner_join(y, by = by, copy = copy, suffix = suffix, ...) 
|> @@ -780,36 +298,22 @@ inner_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } -#' Right join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. (See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object -#' -#' @examples +#' @name right_join +#' @rdname right_join +#' @inherit dplyr::right_join #' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) +#' @examples +#' data(se_mini) +#' annotation <- se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) #' -#' @rdname join-methods -#' @name right_join #' @importFrom dplyr right_join #' @export -NULL - -#' @inheritParams right_join -#' @export -right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), - ...) -{ +right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::right_join(y, by = by, copy = copy, suffix = suffix, ...) |> @@ -820,37 +324,22 @@ right_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } - -#' Full join datasets -#' -#' @param x tbls to join. (See dplyr) -#' @param y tbls to join. 
(See dplyr) -#' @param by A character vector of variables to join by. (See dplyr) -#' @param copy If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr) -#' @param suffix If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. (See dplyr) -#' @param ... Data frames to combine (See dplyr) -#' -#' @return A tt object +#' @name full_join +#' @rdname full_join +#' @inherit dplyr::full_join #' #' @examples -#' -#' annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) +#' data(se_mini) +#' annotation <- se_mini |> tidybulk() |> as_tibble() |> +#' distinct(.sample) |> mutate(source = "AU") +#' se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) #' -#' @rdname join-methods -#' @name full_join #' @importFrom dplyr full_join #' @export -NULL - -#' @inheritParams full_join -#' @export -full_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), - ...) -{ +full_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, + suffix = c(".x", ".y"), ...) { x |> drop_class(c("tidybulk", "tt")) |> dplyr::full_join(y, by = by, copy = copy, suffix = suffix, ...) 
|> @@ -861,9 +350,12 @@ full_join.tidybulk <- function (x, y, by = NULL, copy = FALSE, suffix = c(".x", # Add class add_class("tt") |> add_class("tidybulk") - } #' @importFrom dplyr do #' @export dplyr::do + +#' @importFrom dplyr select +#' @export +dplyr::select diff --git a/R/functions.R b/R/functions.R index 9dec0e03..e07cbb37 100755 --- a/R/functions.R +++ b/R/functions.R @@ -15,7 +15,7 @@ #' @return A tibble with an additional column #' #' -create_tt_from_tibble_bulk = function(.data, +create_tt_from_tibble_bulk <- function(.data, .sample, .transcript, .abundance, @@ -47,13 +47,14 @@ create_tt_from_tibble_bulk = function(.data, #' @importFrom purrr reduce #' #' @param file_names A character vector -#' @param genome A character string specifying an in-built annotation used for read summarization. It has four possible values including "mm10", "mm9", "hg38" and "hg19" +#' @param genome A character string specifying an in-built annotation used for +#' read summarization. It has four possible values including "mm10", "mm9", +#' "hg38" and "hg19" #' @param ... Further parameters passed to the function Rsubread::featureCounts #' #' @return A tibble of gene counts #' -create_tt_from_bam_sam_bulk <- - function(file_names, genome = "hg38", ...) { +create_tt_from_bam_sam_bulk <- function(file_names, genome = "hg38", ...) 
{ # This function uses Subread to count the gene features, # annotate gene features with symbols, and # convert the data frame to tibble format @@ -76,7 +77,8 @@ create_tt_from_bam_sam_bulk <- edgeR::DGEList( counts = (.)$counts, genes = (.)$annotation[, c("GeneID", "Length")], - samples = (.)$stat %>% as_tibble() %>% gather(sample, temp,-Status) %>% spread(Status, temp) + samples = (.)$stat %>% as_tibble() %>% gather(sample, temp,-Status) %>% + spread(Status, temp) ) } %>% @@ -162,7 +164,8 @@ create_tt_from_bam_sam_bulk <- #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A string character. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param method A string character. The scaling method passed to the backend +#' function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") #' #' #' @return A list including the filtered data frame and the normalization factors @@ -248,8 +251,10 @@ add_scaled_counts_bulk.calcNormFactor <- function(.data, #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param reference_sample A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference. +#' @param method A character string. The scaling method passed to the backend +#' function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param reference_sample A character string. The name of the reference sample. 
+#' If NULL the sample with highest total read count will be selected as reference. #' #' #' @return A tibble including additional columns @@ -304,7 +309,9 @@ get_scaled_counts_bulk <- function(.data, ) # Communicate the reference if chosen by default - if(is.null(reference_sample)) message(sprintf("tidybulk says: the sample with largest library size %s was chosen as reference for scaling", reference)) + if(is.null(reference_sample)) message( + sprintf("tidybulk says: the sample with largest library size %s was chosen as reference for scaling", + reference)) nf_obj <- add_scaled_counts_bulk.calcNormFactor( @@ -378,27 +385,36 @@ get_scaled_counts_bulk <- function(.data, #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param test_above_log2_fold_change A positive real value. This works for edgeR and limma_voom methods. It uses the `treat` function, which tests that the difference in abundance is bigger than this threshold rather than zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. -#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param .contrasts A character vector. See edgeR makeContrasts specification +#' for the parameter `contrasts`. 
If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), +#' "edgeR_likelihood_ratio" (i.e., LRT) +#' @param test_above_log2_fold_change A positive real value. This works for +#' edgeR and limma_voom methods. It uses the `treat` function, which tests +#' that the difference in abundance is bigger than this threshold rather than +#' zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE", +#' "upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. #' @param .sample_total_read_count #' #' @return A tibble with edgeR results #' -get_differential_transcript_abundance_bulk <- function(.data, - .formula, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - .contrasts = NULL, - method = "edgeR_quasi_likelihood", - test_above_log2_fold_change = NULL, - scaling_method = "TMM", - omit_contrast_in_colnames = FALSE, - prefix = "", - .sample_total_read_count = NULL) { +get_differential_transcript_abundance_bulk <- function(data, + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + .contrasts = NULL, + method = "edgeR_quasi_likelihood", + test_above_log2_fold_change = NULL, + scaling_method = "TMM", + omit_contrast_in_colnames = FALSE, + prefix = "", + .sample_total_read_count = NULL) { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -462,7 +478,9 @@ get_differential_transcript_abundance_bulk <- function(.data, design = model.matrix( object = .formula, - data = df_for_edgeR %>% select(!!.sample, any_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample) + data = df_for_edgeR %>% + select(!!.sample, 
any_of(parse_formula(.formula))) %>% + distinct %>% arrange(!!.sample) ) # # Print the design column names in case I want contrasts @@ -475,7 +493,8 @@ get_differential_transcript_abundance_bulk <- function(.data, # Replace `:` with ___ because it creates error with edgeR if(design |> colnames() |> str_detect(":") |> any()) { - message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.") + message("tidybulk says: the interaction term `:` has been replaced ", + "with `___` in the design matrix, in order to work with edgeR.") colnames(design) = design |> colnames() |> str_replace(":", "___") } @@ -501,7 +520,9 @@ get_differential_transcript_abundance_bulk <- function(.data, colnames(design)[1] != "(Intercept)" ) ) - warning("tidybulk says: If you have (i) an intercept-free design (i.e. ~ 0 + factor) or you have a categorical factor of interest with more than 2 values you should use the `contrasts` argument.") + warning("tidybulk says: If you have (i) an intercept-free design", + "(i.e. 
~ 0 + factor) or you have a categorical factor of interest with", + "more than 2 values you should use the `contrasts` argument.") my_contrasts = .contrasts %>% @@ -511,7 +532,8 @@ get_differential_transcript_abundance_bulk <- function(.data, # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing edgeR needed for differential transcript abundance analyses") + message("tidybulk says: Installing edgeR needed for differential", + "transcript abundance analyses") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) @@ -534,7 +556,8 @@ get_differential_transcript_abundance_bulk <- function(.data, !quo_is_null(.sample_total_read_count) ~ { # New library size dataset - new_lib_size = .data %>% pivot_sample(!!.sample) %>% select(!!.sample, !!.sample_total_read_count) + new_lib_size = .data %>% pivot_sample(!!.sample) %>% + select(!!.sample, !!.sample_total_read_count) x = (.) x$samples$lib.size = @@ -549,15 +572,20 @@ get_differential_transcript_abundance_bulk <- function(.data, # Scale data if method is not "none" when( - scaling_method != "none" ~ (.) %>% edgeR::calcNormFactors(method = scaling_method), + scaling_method != "none" ~ (.) %>% + edgeR::calcNormFactors(method = scaling_method), ~ (.) ) %>% # select method when( - tolower(method) == "edger_likelihood_ratio" ~ (.) %>% edgeR::estimateDisp(design) %>% edgeR::glmFit(design), - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% edgeR::estimateDisp(design) %>% edgeR::glmQLFit(design), - tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% edgeR::estimateGLMRobustDisp(design) %>% edgeR::glmFit(design) + tolower(method) == "edger_likelihood_ratio" ~ (.) %>% + edgeR::estimateDisp(design) %>% edgeR::glmFit(design), + tolower(method) == "edger_quasi_likelihood" ~ (.) 
%>% + edgeR::estimateDisp(design) %>% edgeR::glmQLFit(design), + tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% + edgeR::estimateGLMRobustDisp(design) %>% + edgeR::glmFit(design) ) @@ -572,9 +600,14 @@ get_differential_transcript_abundance_bulk <- function(.data, # select method when( - !is.null(test_above_log2_fold_change) ~ (.) %>% edgeR::glmTreat(coef = 2, contrast = my_contrasts, lfc=test_above_log2_fold_change), - tolower(method) %in% c("edger_likelihood_ratio", "edger_robust_likelihood_ratio") ~ (.) %>% edgeR::glmLRT(coef = 2, contrast = my_contrasts) , - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% edgeR::glmQLFTest(coef = 2, contrast = my_contrasts) + !is.null(test_above_log2_fold_change) ~ (.) %>% + edgeR::glmTreat(coef = 2, contrast = my_contrasts, + lfc=test_above_log2_fold_change), + tolower(method) %in% c("edger_likelihood_ratio", + "edger_robust_likelihood_ratio") ~ (.) %>% + edgeR::glmLRT(coef = 2, contrast = my_contrasts), + tolower(method) == "edger_quasi_likelihood" ~ (.) %>% + edgeR::glmQLFTest(coef = 2, contrast = my_contrasts) ) %>% # Convert to tibble @@ -598,9 +631,14 @@ get_differential_transcript_abundance_bulk <- function(.data, # select method when( - !is.null(test_above_log2_fold_change) ~ (.) %>% edgeR::glmTreat(coef = 2, contrast = my_contrasts[, .x], lfc=test_above_log2_fold_change), - tolower(method) %in% c("edger_likelihood_ratio", "edger_robust_likelihood_ratio") ~ (.) %>% edgeR::glmLRT(coef = 2, contrast = my_contrasts[, .x]) , - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% edgeR::glmQLFTest(coef = 2, contrast = my_contrasts[, .x]) + !is.null(test_above_log2_fold_change) ~ (.) %>% + edgeR::glmTreat(coef = 2, contrast = my_contrasts[, .x], + lfc=test_above_log2_fold_change), + tolower(method) %in% c("edger_likelihood_ratio", + "edger_robust_likelihood_ratio") ~ (.) %>% + edgeR::glmLRT(coef = 2, contrast = my_contrasts[, .x]) , + tolower(method) == "edger_quasi_likelihood" ~ (.) 
%>% + edgeR::glmQLFTest(coef = 2, contrast = my_contrasts[, .x]) ) %>% # Convert to tibble @@ -629,12 +667,16 @@ get_differential_transcript_abundance_bulk <- function(.data, # select method when( - tolower(method) == "edger_likelihood_ratio" ~ (.) %>% memorise_methods_used(c("edger", "edgeR_likelihood_ratio")), - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% memorise_methods_used(c("edger", "edgeR_quasi_likelihood")), - tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% memorise_methods_used(c("edger", "edger_robust_likelihood_ratio")) + tolower(method) == "edger_likelihood_ratio" ~ (.) %>% + memorise_methods_used(c("edger", "edgeR_likelihood_ratio")), + tolower(method) == "edger_quasi_likelihood" ~ (.) %>% + memorise_methods_used(c("edger", "edgeR_quasi_likelihood")), + tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% + memorise_methods_used(c("edger", "edger_robust_likelihood_ratio")) ) %>% when( - !is.null(test_above_log2_fold_change) ~ (.) %>% memorise_methods_used("treat"), + !is.null(test_above_log2_fold_change) ~ (.) %>% + memorise_methods_used("treat"), ~ (.) ) %>% @@ -642,8 +684,9 @@ get_differential_transcript_abundance_bulk <- function(.data, attach_to_internals(edgeR_object, "edgeR") %>% # Communicate the attribute added { - - rlang::inform("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$edgeR`", .frequency_id = "Access DE results edgeR", .frequency = "once") + rlang::inform(paste0("tidybulk says: to access the raw results (fitted GLM)", + " do `attr(..., \"internals\")$edgeR`"), + .frequency_id = "Access DE results edgeR", .frequency = "once") (.) 
} @@ -668,33 +711,41 @@ get_differential_transcript_abundance_bulk <- function(.data, #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables +#' @param .formula a formula with no response variable, referring only to +#' numeric variables #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column #' @param .contrasts A character vector. Not used for this method -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param test_above_log2_fold_change A positive real value. This works for edgeR and limma_voom methods. It uses the `treat` function, which tests that the difference in abundance is bigger than this threshold rather than zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. -#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), +#' "edgeR_likelihood_ratio" (i.e., LRT) +#' @param test_above_log2_fold_change A positive real value. This works for +#' edgeR and limma_voom methods. It uses the `treat` function, which tests +#' that the difference in abundance is bigger than this threshold rather than +#' zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE", +#' "upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. 
#' @param .sample_total_read_count #' #' @return A tibble with glmmSeq results #' get_differential_transcript_abundance_glmmSeq <- function(.data, - .formula, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - .contrasts = NULL, - method , - test_above_log2_fold_change = NULL, - scaling_method = NULL, - omit_contrast_in_colnames = FALSE, - prefix = "", - .sample_total_read_count = NULL, - .dispersion = NULL, - ...) { + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + .contrasts = NULL, + method , + test_above_log2_fold_change = NULL, + scaling_method = NULL, + omit_contrast_in_colnames = FALSE, + prefix = "", + .sample_total_read_count = NULL, + .dispersion = NULL, + ...) { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -704,7 +755,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ - warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") + warning("tidybulk says: you can omit contrasts in column names only", + "when maximum one contrast is present") omit_contrast_in_colnames = FALSE } @@ -738,7 +790,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # Check if package is installed, otherwise install if (find.package("edgeR", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing edgeR needed for differential transcript abundance analyses") + message("tidybulk says: Installing edgeR needed for differential", + " transcript abundance analyses") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("edgeR", ask = FALSE) @@ -746,7 +799,8 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, # Check if package is installed, otherwise install if (find.package("glmmSeq", quiet = 
TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing glmmSeq needed for differential transcript abundance analyses") + message("tidybulk says: Installing glmmSeq needed for differential ", + "transcript abundance analyses") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("glmmSeq", ask = FALSE) @@ -774,9 +828,12 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, ) if(quo_is_symbolic(.dispersion)) - dispersion = .data |> pivot_transcript(!!.transcript) |> select(!!.transcript, !!.dispersion) |> deframe() + dispersion = .data |> pivot_transcript(!!.transcript) |> + select(!!.transcript, !!.dispersion) |> deframe() else - dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts)) + dispersion = counts |> + edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> + setNames(rownames(counts)) # # Check dispersion # if(!names(dispersion) |> sort() |> identical( @@ -805,7 +862,9 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, glmmSeq_object |> summary_lmmSeq() |> as_tibble(rownames = "gene") |> - mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> + mutate(across(starts_with("P_"), + list(adjusted = function(x) p.adjust(x, method="BH")), + .names = "{.col}_{.fn}")) |> # Attach attributes reattach_internals(.data) %>% @@ -817,9 +876,9 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, attach_to_internals(glmmSeq_object, "glmmSeq") %>% # Communicate the attribute added { - - rlang::inform("\ntidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$glmmSeq`", .frequency_id = "Access DE results glmmSeq", .frequency = "once") - + rlang::inform(paste0("\ntidybulk says: to access the raw results", + " (fitted GLM) do `attr(..., \"internals\")$glmmSeq`"), + 
.frequency_id = "Access DE results glmmSeq", .frequency = "once") (.) } %>% @@ -846,28 +905,34 @@ get_differential_transcript_abundance_glmmSeq <- function(.data, #' @importFrom dplyr arrange #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables +#' @param .formula a formula with no response variable, +#' referring only to numeric variables #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param .contrasts A character vector. See voom makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param .contrasts A character vector. See voom makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) #' @param method A string character. Either "limma_voom", "limma_voom_sample_weights" -#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE", +#' "upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. 
#' #' @return A tibble with voom results #' get_differential_transcript_abundance_bulk_voom <- function(.data, - .formula, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - .contrasts = NULL, - method = NULL, - test_above_log2_fold_change = NULL, - scaling_method = "TMM", - omit_contrast_in_colnames = FALSE, - prefix = "") { + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + .contrasts = NULL, + method = NULL, + test_above_log2_fold_change = NULL, + scaling_method = "TMM", + omit_contrast_in_colnames = FALSE, + prefix = "") { # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -875,7 +940,8 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ - warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") + warning("tidybulk says: you can omit contrasts in column names ", + "only when maximum one contrast is present") omit_contrast_in_colnames = FALSE } @@ -897,7 +963,10 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, design = model.matrix( object = .formula, - data = df_for_voom %>% select(!!.sample, any_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample) + data = df_for_voom %>% + select(!!.sample, any_of(parse_formula(.formula))) %>% + distinct %>% + arrange(!!.sample) ) # Print the design column names in case I want contrasts @@ -916,7 +985,8 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, # Check if package is installed, otherwise install if (find.package("limma", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing limma needed for differential transcript abundance analyses") + message("tidybulk says: Installing limma needed for differential ", + "transcript abundance analyses") if (!requireNamespace("BiocManager", quietly = TRUE)) 
install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("limma", ask = FALSE) @@ -932,14 +1002,17 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, # Scale data if method is not "none" when( - scaling_method != "none" ~ (.) %>% edgeR::calcNormFactors(method = scaling_method), + scaling_method != "none" ~ (.) %>% + edgeR::calcNormFactors(method = scaling_method), ~ (.) ) %>% # select method when( - tolower(method) == "limma_voom" ~ (.) %>% limma::voom(design, plot=FALSE), - tolower(method) == "limma_voom_sample_weights" ~ (.) %>% limma::voomWithQualityWeights(design, plot=FALSE) + tolower(method) == "limma_voom" ~ (.) %>% + limma::voom(design, plot=FALSE), + tolower(method) == "limma_voom_sample_weights" ~ (.) %>% + limma::voomWithQualityWeights(design, plot=FALSE) ) %>% limma::lmFit(design) @@ -954,7 +1027,8 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, ~ .x %>% # Contrasts - limma::contrasts.fit(contrasts=my_contrasts, coefficients = when(my_contrasts, is.null(.) ~ 2)) %>% + limma::contrasts.fit(contrasts=my_contrasts, + coefficients = when(my_contrasts, is.null(.) ~ 2)) %>% limma::eBayes() %>% when( @@ -1019,11 +1093,14 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, # select method when( - tolower(method) == "limma_voom" ~ (.) %>% memorise_methods_used("voom"), - tolower(method) == "limma_voom_sample_weights" ~ (.) %>% memorise_methods_used("voom_sample_weights") + tolower(method) == "limma_voom" ~ (.) %>% + memorise_methods_used("voom"), + tolower(method) == "limma_voom_sample_weights" ~ (.) %>% + memorise_methods_used("voom_sample_weights") ) %>% when( - !is.null(test_above_log2_fold_change) ~ (.) %>% memorise_methods_used("treat"), + !is.null(test_above_log2_fold_change) ~ (.) %>% + memorise_methods_used("treat"), ~ (.) 
) %>% @@ -1031,7 +1108,9 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, attach_to_internals(voom_object, "voom") %>% # Communicate the attribute added { - rlang::inform("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$voom`", .frequency_id = "Access DE results voom", .frequency = "once") + rlang::inform(paste0("tidybulk says: to access the raw results ", + "(fitted GLM) do `attr(..., \"internals\")$voom`"), + .frequency_id = "Access DE results voom", .frequency = "once") (.) } @@ -1054,37 +1133,46 @@ get_differential_transcript_abundance_bulk_voom <- function(.data, #' @importFrom dplyr mutate_if #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables +#' @param .formula a formula with no response variable, referring only to +#' numeric variables #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param .contrasts A character vector. See edgeR makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param method A string character. 
Either "edgeR_quasi_likelihood" +#' (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; +#' "TMM","TMMwsp","RLE","upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. #' @param ... Additional arguments for DESeq2 #' #' @return A tibble with DESeq2 results #' get_differential_transcript_abundance_deseq2 <- function(.data, - .formula, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - .contrasts = NULL, - method = "deseq2", - test_above_log2_fold_change = NULL, - scaling_method = "TMM", - omit_contrast_in_colnames = FALSE, - prefix = "", - ...) { + .formula, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + .contrasts = NULL, + method = "deseq2", + test_above_log2_fold_change = NULL, + scaling_method = "TMM", + omit_contrast_in_colnames = FALSE, + prefix = "", + ...) { # Check if contrasts are of the same form if( .contrasts %>% is.null %>% not() & .contrasts %>% is("list") %>% not() ) - stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") + stop("tidybulk says: for DESeq2 the list of constrasts should be given in ", + "the form list(c(\"condition_column\",\"condition1\",\"condition2\")) ", + "i.e. 
list(c(\"genotype\",\"knockout\",\"wildtype\"))") # Get column names .sample = enquo(.sample) @@ -1093,13 +1181,15 @@ get_differential_transcript_abundance_deseq2 <- function(.data, # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ - warning("tidybulk says: you can omit contrasts in column names only when maximum one contrast is present") + warning("tidybulk says: you can omit contrasts in column names ", + "only when maximum one contrast is present") omit_contrast_in_colnames = FALSE } # Check if package is installed, otherwise install if (find.package("DESeq2", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing DESeq2 needed for differential transcript abundance analyses") + message("tidybulk says: Installing DESeq2 needed for differential ", + "transcript abundance analyses") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("DESeq2", ask = FALSE) @@ -1173,7 +1263,8 @@ get_differential_transcript_abundance_deseq2 <- function(.data, # Simple comparison discrete my_contrasts %>% is.null %>% not() & omit_contrast_in_colnames ~ (.) 
%>% - DESeq2::results(contrast = my_contrasts[[1]], lfcThreshold=test_above_log2_fold_change)%>% + DESeq2::results(contrast = my_contrasts[[1]], + lfcThreshold=test_above_log2_fold_change)%>% as_tibble(rownames = quo_name(.transcript)), # Multiple comparisons NOT USED AT THE MOMENT @@ -1185,11 +1276,13 @@ get_differential_transcript_abundance_deseq2 <- function(.data, ~ deseq2_obj %>% # select method - DESeq2::results(contrast = my_contrasts[[.x]], lfcThreshold=test_above_log2_fold_change) %>% + DESeq2::results(contrast = my_contrasts[[.x]], + lfcThreshold=test_above_log2_fold_change) %>% # Convert to tibble as_tibble(rownames = quo_name(.transcript)) %>% - mutate(constrast = sprintf("%s %s-%s", my_contrasts[[.x]][1], my_contrasts[[.x]][2], my_contrasts[[.x]][3]) ) + mutate(constrast = sprintf("%s %s-%s", my_contrasts[[.x]][1], + my_contrasts[[.x]][2], my_contrasts[[.x]][3])) ) %>% pivot_wider(values_from = -c(!!.transcript, constrast), @@ -1212,8 +1305,9 @@ get_differential_transcript_abundance_deseq2 <- function(.data, # Communicate the attribute added { - - rlang::inform("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$DESeq2`", .frequency_id = "Access DE results deseq2", .frequency = "once") + rlang::inform(paste0("tidybulk says: to access the raw results ", + "(fitted GLM) do `attr(..., \"internals\")$DESeq2`"), + .frequency_id = "Access DE results deseq2", .frequency = "once") (.) } @@ -1240,12 +1334,15 @@ get_differential_transcript_abundance_deseq2 <- function(.data, #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables +#' @param .formula a formula with no response variable, referring +#' only to numeric variables #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A string character. 
Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance +#' @param method A string character. Either "edgeR_quasi_likelihood" +#' (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) +#' @param reference A data frame. The transcript/cell_type data frame of +#' integer transcript abundance #' @param significance_threshold A real between 0 and 1 #' #' @return A tibble with edgeR results @@ -1306,7 +1403,8 @@ test_differential_cellularity_ <- function(.data, .formula %>% when( # If I have the dot, needed definitely for censored - format(.) %>% grepl("\\.", .) %>% any ~ format(.) %>% str_replace("([-\\+\\*~ ]?)(\\.)", "\\1.proportion_0_corrected"), + format(.) %>% grepl("\\.", .) %>% any ~ format(.) %>% + str_replace("([-\\+\\*~ ]?)(\\.)", "\\1.proportion_0_corrected"), # If normal formula ~ sprintf(".proportion_0_corrected%s", format(.)) @@ -1316,16 +1414,17 @@ test_differential_cellularity_ <- function(.data, # Test result = univariable_differential_tissue_composition(deconvoluted, - method, - .my_formula, - min_detected_proportion) %>% + method, + .my_formula, + min_detected_proportion) %>% # Attach attributes reattach_internals(.data) %>% # Add methods used when( - grepl("Surv", .my_formula) ~ (.) %>% memorise_methods_used(c("survival", "boot")), + grepl("Surv", .my_formula) ~ (.) %>% + memorise_methods_used(c("survival", "boot")), ~ (.) %>% memorise_methods_used("betareg") ) } else { @@ -1366,7 +1465,9 @@ test_differential_cellularity_ <- function(.data, reattach_internals(.data) %>% # Add methods used - when(grepl("Surv", .my_formula) ~ (.) %>% memorise_methods_used(c("survival", "boot"), object_containing_methods = .data), + when(grepl("Surv", .my_formula) ~ (.) 
%>% + memorise_methods_used(c("survival", "boot"), + object_containing_methods = .data), ~ (.)) } @@ -1401,12 +1502,15 @@ test_differential_cellularity_ <- function(.data, #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables +#' @param .formula a formula with no response variable, referring +#' only to numeric variables #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance +#' @param method A string character. Either "edgeR_quasi_likelihood" +#' (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) +#' @param reference A data frame. The transcript/cell_type data frame of +#' integer transcript abundance #' @param significance_threshold A real between 0 and 1 #' #' @return A tibble with edgeR results @@ -1461,7 +1565,8 @@ test_stratification_cellularity_ <- function(.data, reattach_internals(.data) %>% # Add methods used - memorise_methods_used(c("survival", "boot", "survminer"), object_containing_methods = .data) + memorise_methods_used(c("survival", "boot", "survminer"), + object_containing_methods = .data) } %>% # Eliminate prefix @@ -1486,29 +1591,42 @@ test_stratification_cellularity_ <- function(.data, #' @importFrom utils install.packages #' #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula with no response variable, representing the desired linear model +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or 
`SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .formula A formula with no response variable, representing the +#' desired linear model #' @param .sample The name of the sample column #' @param .entrez The ENTREZ code of the transcripts/genes #' @param .abundance The name of the transcript/gene abundance column -#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param methods A character vector. One or 3 or more methods to use in the testing (currently EGSEA errors if 2 are used). Type EGSEA::egsea.base() to see the supported GSE methods. -#' @param gene_sets A character vector or a list. It can take one or more of the following built-in collections as a character vector: c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. Alternatively, a list of user-supplied gene sets can be provided, to be used with EGSEA buildCustomIdx. In that case, each gene set is a character vector of Entrez IDs and the names of the list are the gene set names. +#' @param .contrasts A character vector. See edgeR makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param methods A character vector. One or 3 or more methods to use in the +#' testing (currently EGSEA errors if 2 are used). Type EGSEA::egsea.base() +#' to see the supported GSE methods. +#' @param gene_sets A character vector or a list. 
It can take one or more of +#' the following built-in collections as a character vector: c("h", "c1", "c2", +#' "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", +#' "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. +#' Alternatively, a list of user-supplied gene sets can be provided, to be +#' used with EGSEA buildCustomIdx. In that case, each gene set is a character +#' vector of Entrez IDs and the names of the list are the gene set names. #' @param species A character. It can be human, mouse or rat. #' @param cores An integer. The number of cores available #' #' @return A tibble with edgeR results #' test_gene_enrichment_bulk_EGSEA <- function(.data, - .formula, - .sample = NULL, - .entrez, - .abundance = NULL, - .contrasts = NULL, - methods, - gene_sets, - species, - cores = 10) { + .formula, + .sample = NULL, + .entrez, + .abundance = NULL, + .contrasts = NULL, + methods, + gene_sets, + species, + cores = 10) { # Comply with CRAN NOTES . = NULL @@ -1538,14 +1656,18 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, pull(n) %>% min %>% st(2)) - stop("tidybulk says: You need at least two replicates for each condition for EGSEA to work") + stop("tidybulk says: You need at least two replicates for each", + " condition for EGSEA to work") # Create design matrix design = model.matrix( object = .formula, - data = df_for_edgeR %>% select(!!.sample, any_of(parse_formula(.formula))) %>% distinct %>% arrange(!!.sample) + data = df_for_edgeR %>% + select(!!.sample, any_of(parse_formula(.formula))) %>% + distinct %>% + arrange(!!.sample) ) # Print the design column names in case I want contrasts @@ -1564,13 +1686,13 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, # Check if package is installed, otherwise install if (find.package("EGSEA", quiet = TRUE) %>% length %>% equals(0)) { - stop(" - EGSEA not installed. Please install it. EGSEA requires manual installation to not overwhelm the user in case it is not needed. 
- BiocManager::install(\"EGSEA\", ask = FALSE) - ") + stop("EGSEA not installed. Please install it. EGSEA requires manual ", + "installation to not overwhelm the user in case it is not needed.", + "BiocManager::install(\"EGSEA\", ask = FALSE)") } if (!"EGSEA" %in% (.packages())) { - stop("EGSEA package not loaded. Please run library(\"EGSEA\"). With this setup, EGSEA require manual loading, for technical reasons.") + stop("EGSEA package not loaded. Please run library(\"EGSEA\"). ", + "With this setup, EGSEA require manual loading, for technical reasons.") } dge = @@ -1590,7 +1712,8 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, if (is.list(gene_sets)) { - idx = buildCustomIdx(geneIDs = rownames(dge), species = species, gsets=gene_sets) + idx = buildCustomIdx(geneIDs = rownames(dge), + species = species, gsets=gene_sets) nonkegg_genesets = idx kegg_genesets = NULL @@ -1621,7 +1744,8 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, } - idx = buildIdx(entrezIDs = rownames(dge), species = species, msigdb.gsets = msigdb.gsets, + idx = buildIdx(entrezIDs = rownames(dge), species = species, + msigdb.gsets = msigdb.gsets, kegg.exclude = kegg.exclude) # Due to a bug with kegg pathview overlays, this collection is run without report @@ -1673,7 +1797,9 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, } if (length(kegg_genesets) != 0) { - message("tidybulk says: due to a bug in the call to KEGG database (http://supportupgrade.bioconductor.org/p/122172/#122218), the analysis for this database is run without report production.") + message("tidybulk says: due to a bug in the call to KEGG database ", + "(http://supportupgrade.bioconductor.org/p/122172/#122218), ", + "the analysis for this database is run without report production.") res_kegg = dge %>% @@ -1716,7 +1842,8 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, # add to bibliography if (exists("collections_bib")) { - out %>% memorise_methods_used(c("egsea", collections_bib, methods), 
object_containing_methods = .data) + out %>% memorise_methods_used(c("egsea", collections_bib, methods), + object_containing_methods = .data) } } @@ -1732,11 +1859,15 @@ test_gene_enrichment_bulk_EGSEA <- function(.data, #' @importFrom rlang := #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally samples) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally genes) +#' @param .abundance A column symbol with the value the clustering is based +#' on (e.g., `count`) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally samples) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally genes) #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be log-transformed +#' (e.g., TRUE for RNA sequencing data) #' @param ... Further parameters passed to the function kmeans #' #' @return A tibble with additional columns @@ -1753,7 +1884,8 @@ get_clusters_kmeans_bulk <- # Check if centers is in dots dots_args = rlang::dots_list(...) if ("centers" %in% names(dots_args) %>% not()) - stop("tidybulk says: for kmeans you need to provide the \"centers\" integer argument") + stop('tidybulk says: for kmeans you need to provide the ', + '"centers" integer argument') # Get column names .element = enquo(.element) @@ -1800,11 +1932,15 @@ get_clusters_kmeans_bulk <- #' @importFrom utils install.packages #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .feature A column symbol. 
The column that is represents entities to cluster (i.e., normally samples) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally genes) +#' @param .abundance A column symbol with the value the clustering +#' is based on (e.g., `count`) +#' @param .feature A column symbol. The column that is represents entities +#' to cluster (i.e., normally samples) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally genes) #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' @param ... Further parameters passed to the function kmeans #' #' @return A tibble with additional columns @@ -1883,13 +2019,18 @@ get_clusters_SNN_bulk <- #' @importFrom rlang inform #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering +#' is based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of +#' interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. 
How many top genes to select #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' #' @return A tibble with additional columns #' @@ -1916,7 +2057,8 @@ get_reduced_dimensions_MDS_bulk <- # Convert components to components list - if((length(components) %% 2) != 0 ) components = components %>% append(components[1]) + if((length(components) %% 2) != 0 ) components = components %>% + append(components[1]) components_list = split(components, ceiling(seq_along(components)/2)) # Loop over components list and calculate MDS. (I have to make this process more elegant) @@ -1933,7 +2075,8 @@ get_reduced_dimensions_MDS_bulk <- # Stop any column is not if not numeric or integer ifelse_pipe( - (.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer")) %>% `!`() %>% any(), + (.) %>% select(!!.abundance) %>% summarise_all(class) %>% + `%in%`(c("numeric", "integer")) %>% `!`() %>% any(), ~ stop(".abundance must be numerical or integer") ) %>% spread(!!.element,!!.abundance) %>% @@ -1977,8 +2120,9 @@ get_reduced_dimensions_MDS_bulk <- attach_to_internals(mds_object, "MDS") %>% # Communicate the attribute added { - - rlang::inform("tidybulk says: to access the raw results do `attr(..., \"internals\")$MDS`", .frequency_id = "Access MDS results", .frequency = "once") + rlang::inform(paste0("tidybulk says: to access the raw results do ", + "`attr(..., \"internals\")$MDS`"), + .frequency_id = "Access MDS results", .frequency = "once") (.) 
} @@ -1999,13 +2143,18 @@ get_reduced_dimensions_MDS_bulk <- #' @importFrom rlang inform #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering is +#' based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of +#' interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities +#' to cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. How many top genes to select #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' @param scale A boolean #' @param ... Further parameters passed to the function prcomp #' @@ -2050,7 +2199,8 @@ get_reduced_dimensions_PCA_bulk <- # Stop any column is not if not numeric or integer ifelse_pipe( - (.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer", "bouble")) %>% not() %>% any(), + (.) 
%>% select(!!.abundance) %>% summarise_all(class) %>% + `%in%`(c("numeric", "integer", "bouble")) %>% not() %>% any(), ~ stop("tidybulk says: .abundance must be numerical or integer") ) %>% @@ -2067,16 +2217,16 @@ get_reduced_dimensions_PCA_bulk <- # First function ~ stop( - "tidybulk says: In calculating PCA there is no gene that have non NA values is all samples" + "tidybulk says: In calculating PCA there is no gene that have ", + "non NA values is all samples" ), # Second function ~ { warning( - " - tidybulk says: In PCA correlation there is < 100 genes that have non NA values is all samples. -The correlation calculation would not be reliable, -we suggest to partition the dataset for sample clusters. + "tidybulk says: In PCA correlation there is < 100 genes that have ", + "non NA values is all samples. The correlation calculation would ", + "not be reliable, we suggest to partition the dataset for sample clusters. " ) .x @@ -2123,11 +2273,12 @@ we suggest to partition the dataset for sample clusters. attach_to_internals(prcomp_obj, "PCA") %>% # Communicate the attribute added { - rlang::inform("tidybulk says: to access the raw results do `attr(..., \"internals\")$PCA`", .frequency_id = "Access PCA results", .frequency = "once") + rlang::inform(paste0("tidybulk says: to access the raw results do ", + "`attr(..., \"internals\")$PCA`"), + .frequency_id = "Access PCA results", .frequency = "once") (.) } - } #' Get tSNE @@ -2143,13 +2294,18 @@ we suggest to partition the dataset for sample clusters. #' @importFrom utils install.packages #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. 
The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the +#' clustering is based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal +#' components of interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents +#' entities to cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to +#' calculate distance (i.e., normally samples) #' @param top An integer. How many top genes to select #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' @param ... Further parameters passed to the function Rtsne #' #' @return A tibble with additional columns @@ -2190,10 +2346,11 @@ get_reduced_dimensions_TSNE_bulk <- } # Set perprexity to not be too high - if (!"perplexity" %in% names(arguments)) - arguments = arguments %>% c(perplexity = (( - .data %>% distinct(!!.element) %>% nrow() %>% sum(-1) - ) / 3 / 2) %>% floor() %>% min(30)) + if (!"perplexity" %in% names(arguments)) { + arguments <- arguments %>% c(perplexity = (( + .data %>% distinct(!!.element) %>% nrow() %>% sum(-1) + ) / 3 / 2) %>% floor(.) %>% min(30)) + } # If not enough samples stop if (arguments$perplexity <= 2) @@ -2256,14 +2413,21 @@ get_reduced_dimensions_TSNE_bulk <- #' @importFrom utils install.packages #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. 
The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering +#' is based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components +#' of interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities +#' to cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. How many top genes to select #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) -#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered +#' @param log_transform A boolean, whether the value should be log-transformed +#' (e.g., TRUE for RNA sequencing data) +#' @param calculate_for_pca_dimensions An integer of length one. The number of +#' PCA dimensions to based the UMAP calculatio on. +#' If NULL all variable features are considered #' @param ... Further parameters passed to the function uwot #' #' @return A tibble with additional columns @@ -2286,7 +2450,8 @@ get_reduced_dimensions_UMAP_bulk <- !is(calculate_for_pca_dimensions, "numeric") | length(calculate_for_pca_dimensions) > 1 )) - stop("tidybulk says: the argument calculate_for_pca_dimensions should be NULL or an integer of size 1") + stop("tidybulk says: the argument calculate_for_pca_dimensions ", + "should be NULL or an integer of size 1") # Comply with CRAN NOTES . = NULL @@ -2384,10 +2549,13 @@ get_reduced_dimensions_UMAP_bulk <- #' @param dimension_1_column A column symbol. The column of the dimension 1 #' @param dimension_2_column A column symbol. The column of the dimension 2 #' @param rotation_degrees A real number between 0 and 360 -#' @param .element A column symbol. 
The column that is used to calculate distance (i.e., normally samples) +#' @param .element A column symbol. The column that is used to +#' calculate distance (i.e., normally samples) #' @param of_samples A boolean -#' @param dimension_1_column_rotated A column symbol. The column of the dimension 1 rotated -#' @param dimension_2_column_rotated A column symbol. The column of the dimension 2 rotated +#' @param dimension_1_column_rotated A column symbol. +#' The column of the dimension 1 rotated +#' @param dimension_2_column_rotated A column symbol. +#' The column of the dimension 2 rotated #' #' @return A tibble with additional rotated columns #' @@ -2452,7 +2620,8 @@ get_rotated_dimensions = } #' Aggregates multiple counts from the same samples (e.g., from isoforms) -#' This function aggregates counts over samples, concatenates other character columns, and averages other numeric columns +#' This function aggregates counts over samples, +#' concatenates other character columns, and averages other numeric columns #' #' @keywords internal #' @noRd @@ -2565,23 +2734,31 @@ aggregate_duplicated_transcripts_bulk = # Through warning if there are logicals of factor in the data frame # because they cannot be merged if they are not unique - if (length(columns_to_be_converted)>0 & filter(count_duplicates, n_aggr>1) %>% nrow() %>% gt(0)) { + if (length(columns_to_be_converted)>0 & filter(count_duplicates, n_aggr>1) + %>% nrow() %>% gt(0)) { warning(paste(capture.output({ - cat(crayon::blue("tidybulk says: The following columns were converted to characters, as aggregating those classes with concatenation is not possible.\n")) + cat(crayon::blue("tidybulk says: The following columns were converted ", + "to characters, as aggregating those classes with ", + "concatenation is not possible.\n")) print(.data %>% select(columns_to_be_converted)) }), collapse = "\n")) } # Through warning if there are logicals of factor in the data frame # because they cannot be merged if they are not 
unique - if (length(non_standard_columns)>0 & filter(count_duplicates, n_aggr>1) %>% nrow() %>% gt(0)) { + if (length(non_standard_columns)>0 & filter(count_duplicates, n_aggr>1) %>% + nrow() %>% gt(0)) { warning(paste(capture.output({ - cat(crayon::blue("tidybulk says: If duplicates exist from the following columns, only the first instance was taken (lossy behaviour), as aggregating those classes with concatenation is not possible.\n")) + cat(crayon::blue("tidybulk says: If duplicates exist from the ", + "following columns, only the first instance was taken ", + "(lossy behaviour), as aggregating those classes with ", + "concatenation is not possible.\n")) print(.data %>% select(non_standard_columns)) }), collapse = "\n")) } - # aggregates read .data over samples, concatenates other character columns, and averages other numeric columns + # aggregates read .data over samples, concatenates other character + # columns, and averages other numeric columns .data %>% # transform logicals and factors @@ -2589,7 +2766,8 @@ aggregate_duplicated_transcripts_bulk = mutate_if(is.logical, as.character) %>% # Add the number of duplicates for each gene - dplyr::left_join(count_duplicates, by = c(quo_name(.sample), quo_name(.transcript))) %>% + dplyr::left_join(count_duplicates, + by = c(quo_name(.sample), quo_name(.transcript))) %>% # Anonymous function - binds the unique and the reduced genes, # in the way we have to reduce redundancy just for the duplicated genes @@ -2625,7 +2803,8 @@ aggregate_duplicated_transcripts_bulk = } #' Aggregates multiple counts from the same samples (e.g., from isoforms) -#' This function aggregates counts over samples, concatenates other character columns, and averages other numeric columns +#' This function aggregates counts over samples, concatenates other +#' character columns, and averages other numeric columns #' #' @keywords internal #' @noRd @@ -2670,7 +2849,9 @@ aggregate_duplicated_transcripts_DT = # tidybulk::scale_abundance(.sample_ = 
sample, .abundance_ = abundance, .transcript_ = transcript) if(.data %>% filter(is.na(!!.transcript_)) %>% nrow() %>% gt(0)){ - warning(sprintf("tidybulk says: some of your %s are NAs. Those will be eliminated to correctly aggregate the duplicates", quo_name(.transcript_))) + warning(sprintf(paste0("tidybulk says: some of your %s are NAs.", + "Those will be eliminated to correctly aggregate the duplicates"), + quo_name(.transcript_))) .data = .data %>% filter(!is.na(!!.transcript_)) } # Select which are the numerical columns @@ -2682,9 +2863,9 @@ aggregate_duplicated_transcripts_DT = # If scaled add the column to the exclusion ifelse_pipe(( - ".abundance_scaled" %in% (.data %>% get_tt_columns() %>% names) && - # .data %>% get_tt_columns() %$% .abundance_scaled %>% is.null %>% not() && - quo_name(.data %>% get_tt_columns() %$% .abundance_scaled) %in% (.data %>% colnames) + ".abundance_scaled" %in% (.data %>% get_tt_columns() %>% names) && + # .data %>% get_tt_columns() %$% .abundance_scaled %>% is.null %>% not() && + quo_name(.data %>% get_tt_columns() %$% .abundance_scaled) %in% (.data %>% colnames) ), ~ .x %>% select(-!!( .data %>% get_tt_columns() %$% .abundance_scaled @@ -2700,7 +2881,9 @@ aggregate_duplicated_transcripts_DT = ~ (.) 
) - pasted_strings___ = stringi::stri_c(pull(.data,quo_name(.transcript_)), pull(.data,quo_name(.sample_)), sep = "_") + pasted_strings___ = stringi::stri_c(pull(.data,quo_name(.transcript_)), + pull(.data,quo_name(.sample_)), + sep = "_") #.data = .data %>% mutate(pasted_strings___ = pasted_strings___) duplicates = pasted_strings___%in%pasted_strings___[which(duplicated(pasted_strings___))] @@ -2733,7 +2916,8 @@ aggregate_duplicated_transcripts_DT = } -#' Drop redundant elements (e.g., samples) for which feature (e.g., genes) aboundances are correlated +#' Drop redundant elements (e.g., samples) for which feature +#' (e.g., genes) aboundances are correlated #' #' @keywords internal #' @noRd @@ -2745,25 +2929,30 @@ aggregate_duplicated_transcripts_DT = #' @importFrom dplyr anti_join #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) +#' @param .abundance A column symbol with the value the clustering +#' is based on (e.g., `count`) #' @param correlation_threshold A real number between 0 and 1 #' @param top An integer. How many top genes to select -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .feature A column symbol. The column that is represents entities +#' to cluster (i.e., normally genes) +#' @param .element A column symbol. 
The column that is used to calculate +#' distance (i.e., normally samples) #' @param of_samples A boolean -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be log-transformed +#' (e.g., TRUE for RNA sequencing data) #' #' @return A tibble with redundant elements removed #' #' -remove_redundancy_elements_through_correlation <- function(.data, - .element = NULL, - .feature = NULL, - .abundance = NULL, - correlation_threshold = 0.9, - top = Inf, - of_samples = TRUE, - transform = identity) { +remove_redundancy_elements_through_correlation <- function( + .data, + .element = NULL, + .feature = NULL, + .abundance = NULL, + correlation_threshold = 0.9, + top = Inf, + of_samples = TRUE, + transform = identity) { # Comply with CRAN NOTES . = NULL @@ -2771,14 +2960,16 @@ remove_redundancy_elements_through_correlation <- function(.data, .element = enquo(.element) .feature = enquo(.feature) .abundance = enquo(.abundance) - col_names = get_elements_features_abundance(.data, .element, .feature, .abundance, of_samples) + col_names = get_elements_features_abundance(.data, .element, .feature, + .abundance, of_samples) .element = col_names$.element .feature = col_names$.feature .abundance = col_names$.abundance # Check if .data has more than one element if(.data %>% distinct(!!.element) %>% nrow() <= 1 ) - stop("tidybulk says: You must have more than one element (trancripts if of_samples == FALSE) to perform remove_redundancy") + stop("tidybulk says: You must have more than one element ", + "(trancripts if of_samples == FALSE) to perform remove_redundancy") # Check if package is installed, otherwise install if (find.package("widyr", quiet = TRUE) %>% length %>% equals(0)) { @@ -2866,7 +3057,8 @@ remove_redundancy_elements_through_correlation <- function(.data, #' @param .data A tibble #' @param Dim_a_column A column symbol. 
The column of one principal component #' @param Dim_b_column A column symbol. The column of another principal component -#' @param .element A column symbol. The column that is represents entities to cluster (i.e., normally samples) +#' @param .element A column symbol. The column that is represents entities +#' to cluster (i.e., normally samples) #' @param of_samples A boolean #' #' @return A tibble with pairs dropped @@ -2887,7 +3079,8 @@ remove_redundancy_elements_though_reduced_dimensions <- # Check if .data has more than one element if(.data %>% distinct(!!.element) %>% nrow() <= 1 ) - stop("tidybulk says: You must have more than one element (trancripts if of_samples == FALSE) to perform remove_redundancy") + stop("tidybulk says: You must have more than one element (trancripts ", + "if of_samples == FALSE) to perform remove_redundancy") Dim_a_column = enquo(Dim_a_column) Dim_b_column = enquo(Dim_b_column) @@ -3109,7 +3302,8 @@ run_epic = function(mix, reference = NULL) { # Check if package is installed, otherwise install if (find.package("devtools", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing class needed for EPIC") - install.packages("devtools", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports")) + install.packages("devtools", repos = "https://cloud.r-project.org", + dependencies = c("Depends", "Imports")) } # Check if package is installed, otherwise install @@ -3118,7 +3312,10 @@ run_epic = function(mix, reference = NULL) { devtools::install_github("GfellerLab/EPIC") } - if("EPIC" %in% .packages() %>% not) stop("tidybulk says: Please install and then load the package EPIC manually (i.e. library(EPIC)). This is because EPIC is not in Bioconductor or CRAN so it is not possible to seamlessly make EPIC part of the dependencies.") + if("EPIC" %in% .packages() %>% not) stop("tidybulk says: Please install and ", + "then load the package EPIC manually (i.e. library(EPIC)). 
", + "This is because EPIC is not in Bioconductor or CRAN so it is not possible ", + "to seamlessly make EPIC part of the dependencies.") # Get common markers if( reference |> is("data.frame") | reference |> is("matrix")){ @@ -3163,9 +3360,13 @@ run_epic = function(mix, reference = NULL) { #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance -#' @param method A character string. The method to be used. At the moment Cibersort (default) and llsr (linear least squares regression) are available. -#' @param prefix A character string. The prefix you would like to add to the result columns. It is useful if you want to reshape data. +#' @param reference A data frame. The transcript/cell_type data frame of +#' integer transcript abundance +#' @param method A character string. The method to be used. +#' At the moment Cibersort (default) and llsr (linear least squares regression) +#' are available. +#' @param prefix A character string. The prefix you would like to add to the +#' result columns. It is useful if you want to reshape data. #' @param ... 
Further parameters passed to the function Cibersort #' #' @return A tibble including additional columns @@ -3208,13 +3409,15 @@ get_cell_type_proportions = function(.data, # Check if package is installed, otherwise install if (find.package("class", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing class needed for Cibersort") - install.packages("class", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports")) + install.packages("class", repos = "https://cloud.r-project.org", + dependencies = c("Depends", "Imports")) } # Check if package is installed, otherwise install if (find.package("e1071", quiet = TRUE) %>% length %>% equals(0)) { message("tidybulk says: Installing e1071 needed for Cibersort") - install.packages("e1071", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports")) + install.packages("e1071", repos = "https://cloud.r-project.org", + dependencies = c("Depends", "Imports")) } # Check if package is installed, otherwise install @@ -3232,7 +3435,8 @@ get_cell_type_proportions = function(.data, # Validate reference validate_signature(.data, reference, !!.transcript) - do.call(my_CIBERSORT, list(Y = ., X = reference, QN=FALSE) %>% c(dots_args)) %$% + do.call(my_CIBERSORT, list(Y = ., X = reference, QN=FALSE) %>% + c(dots_args)) %$% proportions %>% as_tibble(rownames = quo_name(.sample)) %>% select(-`P-value`,-Correlation,-RMSE) @@ -3273,7 +3477,9 @@ get_cell_type_proportions = function(.data, } if(method %in% c("mcp_counter", "quantiseq", "xcell") & !"immunedeconv" %in% (.packages())) - stop("tidybulk says: for xcell, mcp_counter, or quantiseq deconvolution you should have the package immunedeconv attached. Please execute library(immunedeconv)") + stop("tidybulk says: for xcell, mcp_counter, or quantiseq ", + "deconvolution you should have the package immunedeconv attached. ", + "Please execute library(immunedeconv)") (.) 
%>% deconvolute(method %>% tolower, tumor = FALSE) %>% @@ -3282,7 +3488,8 @@ get_cell_type_proportions = function(.data, }, ~ stop( - "tidybulk says: please choose between llsr, cibersort, epic, mcp_counter, quantiseq, and xcell methods" + "tidybulk says: please choose between llsr, cibersort, epic, ", + "mcp_counter, quantiseq, and xcell methods" ) ) %>% @@ -3316,24 +3523,26 @@ get_cell_type_proportions = function(.data, #' @importFrom stringr str_c #' #' @param .data A tibble -#' @param .formula a formula with no response variable, of the kind ~ factor_of_interest + batch +#' @param .formula a formula with no response variable, of the +#' kind ~ factor_of_interest + batch #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be log-transformed +#' (e.g., TRUE for RNA sequencing data) #' @param ... Further parameters passed to the function sva::ComBat #' #' @return A tibble with adjusted counts #' #' get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, - .factor_unwanted, - .factor_of_interest, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - method = "combat_seq", - ...) { + .factor_unwanted, + .factor_of_interest, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + method = "combat_seq", + ...) 
{ # Get column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -3343,7 +3552,8 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, # Check if package is installed, otherwise install if (find.package("sva", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing sva - Combat needed for adjustment for unwanted variation") + message("tidybulk says: Installing sva - Combat needed for ", + "adjustment for unwanted variation") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("sva", ask = FALSE) @@ -3351,7 +3561,8 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, # New column name - value_adjusted = as.symbol(sprintf("%s%s", quo_name(.abundance), adjusted_string)) + value_adjusted = as.symbol(sprintf("%s%s", quo_name(.abundance), + adjusted_string)) df_for_combat <- .data %>% @@ -3367,9 +3578,12 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, # Create design matrix design = model.matrix( - object = as.formula(sprintf("~ %s", .data |> select(!!.factor_of_interest) |> colnames() |> str_c(collapse = '+'))), + object = as.formula(sprintf("~ %s", .data |> + select(!!.factor_of_interest) |> + colnames() |> str_c(collapse = '+'))), # get first argument of the .formula - data = df_for_combat %>% select(!!.sample, !!.factor_of_interest) %>% distinct %>% arrange(!!.sample) + data = df_for_combat %>% select(!!.sample, !!.factor_of_interest) %>% + distinct %>% arrange(!!.sample) ) my_batch = @@ -3386,7 +3600,8 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, # Stop any column is not if not numeric or integer ifelse_pipe( - (.) %>% select(!!.abundance) %>% summarise_all(class) %>% `%in%`(c("numeric", "integer")) %>% not() %>% any(), + (.) 
%>% select(!!.abundance) %>% summarise_all(class) %>% + `%in%`(c("numeric", "integer")) %>% not() %>% any(), ~ stop(".abundance must be numerical or integer") ) %>% @@ -3454,9 +3669,12 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, unwanted_covariate_matrix = model.matrix( - object = as.formula(sprintf("~ 0 + %s", .data |> select(!!.factor_unwanted) |> colnames() |> str_c(collapse = '+'))), + object = as.formula(sprintf("~ 0 + %s", .data |> + select(!!.factor_unwanted) |> colnames() |> + str_c(collapse = '+'))), # get first argument of the .formula - data = df_for_combat %>% select(!!.sample, !!.factor_unwanted) %>% distinct %>% arrange(!!.sample) + data = df_for_combat %>% select(!!.sample, !!.factor_unwanted) %>% + distinct %>% arrange(!!.sample) ) adjusted_df = @@ -3479,7 +3697,8 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, dplyr::mutate(!!.abundance := !!.abundance %>% as.integer) } else { - stop("tidybulk says: the argument \"method\" must be combat_seq, combat, or limma_remove_batch_effect") + stop("tidybulk says: the argument \"method\" must be combat_seq, combat, ", + "or limma_remove_batch_effect") } @@ -3503,7 +3722,8 @@ get_adjusted_counts_for_unwanted_variation_bulk <- function(.data, #' @param .transcript A character name of the transcript/gene column #' @param .abundance A character name of the read count column #' @param top An integer. 
How many top genes to select -#' @param log_transform A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform A boolean, whether the value should be log-transformed +#' (e.g., TRUE for RNA sequencing data) #' #' @return A tibble filtered genes #' @@ -3616,12 +3836,14 @@ tidybulk_to_SummarizedExperiment = function(.data, feature_cols = col_direction$vertical_cols counts_cols = col_direction$counts_cols - colData = .data %>% select(!!.sample, sample_cols) %>% distinct %>% arrange(!!.sample) %>% { + colData = .data %>% select(!!.sample, sample_cols) %>% distinct %>% + arrange(!!.sample) %>% { S4Vectors::DataFrame((.) %>% select(-!!.sample), row.names = (.) %>% pull(!!.sample)) } - rowData = .data %>% select(!!.transcript, feature_cols) %>% distinct %>% arrange(!!.transcript) %>% { + rowData = .data %>% select(!!.transcript, feature_cols) %>% distinct %>% + arrange(!!.transcript) %>% { S4Vectors::DataFrame((.) %>% select(-!!.transcript), row.names = (.) 
%>% pull(!!.transcript)) } @@ -3649,7 +3871,8 @@ tidybulk_to_SummarizedExperiment = function(.data, } -#' This function is needed for DE in case the matrix is not rectangular, but includes NA +#' This function is needed for DE in case the matrix is not rectangular, +#' but includes NA #' #' @keywords internal #' @noRd @@ -3665,7 +3888,8 @@ tidybulk_to_SummarizedExperiment = function(.data, #' @importFrom rlang quo_is_symbol #' #' @param .data A tibble -#' @param .formula a formula with no response variable, of the kind ~ factor_of_interest + batch +#' @param .formula a formula with no response variable, of the +#' kind ~ factor_of_interest + batch #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column @@ -3698,7 +3922,8 @@ fill_NA_using_formula = function(.data, colnames # Sample-wise columns - sample_col = .data %>% get_specific_annotation_columns(!!.sample) %>% outersect(col_formula) + sample_col = .data %>% get_specific_annotation_columns(!!.sample) %>% + outersect(col_formula) need_log = .data %>% pull(!!.abundance) %>% max(na.rm=TRUE) > 50 @@ -3708,10 +3933,12 @@ fill_NA_using_formula = function(.data, # Add missing pairs nest(ct_data = -c(col_formula)) %>% - mutate(ct_data = map(ct_data, ~ .x %>% droplevels() %>% complete(!!as.symbol(quo_name(.sample)), !!.transcript) )) %>% + mutate(ct_data = map(ct_data, ~ .x %>% droplevels() %>% + complete(!!as.symbol(quo_name(.sample)), !!.transcript) )) %>% unnest(ct_data) - # For non scaled counts create a pseudo scale based on library size, then calculate imputed and scale back + # For non scaled counts create a pseudo scale based on library size, + # then calculate imputed and scale back abundance_is_int = .data %>% slice(1) %>% pull(!!.abundance) %>% is("integer") .data = .data %>% @@ -3726,7 +3953,9 @@ fill_NA_using_formula = function(.data, # Divide the dataset .data_OK = .data %>% - 
anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) %>% + anti_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% + select( !!.transcript, col_formula) %>% distinct(), + by = c(quo_name(.transcript), col_formula)) %>% # Add the imputed column mutate(!!as.symbol(imputed_column) := !!.abundance) %>% @@ -3737,7 +3966,9 @@ fill_NA_using_formula = function(.data, .data_FIXED = .data %>% - inner_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% select( !!.transcript, col_formula) %>% distinct(), by = c(quo_name(.transcript), col_formula)) %>% + inner_join(.data_completed %>% filter(!!.abundance %>% is.na) %>% + select( !!.transcript, col_formula) %>% distinct(), + by = c(quo_name(.transcript), col_formula)) %>% # attach NAs bind_rows( @@ -3759,8 +3990,13 @@ fill_NA_using_formula = function(.data, library_size = colSums(.x, na.rm = TRUE) .x = .x / library_size } - else message(sprintf("tidybulk says: %s appears not to be scaled for sequencing depth (missing _scaled suffix; if you think this column is idependent of sequencing depth ignore this message), therefore the imputation can produce non meaningful results if sequencing depth for samples are highly variable. If you use force_scaling = TRUE library size will be used for eliminatig some sequencig depth effect before imputation", .y)) - + else message(sprintf(paste0("tidybulk says: %s appears not to be scaled for ", + "sequencing depth (missing _scaled suffix; if you think this column is ", + "idependent of sequencing depth ignore this message), therefore the ", + "imputation can produce non meaningful results if sequencing depth for ", + "samples are highly variable. 
If you use force_scaling = TRUE library ", + "size will be used for eliminatig some sequencig depth effect ", + "before imputation"), .y)) # Log need_log = max(.x, na.rm=TRUE) > 50 @@ -3786,8 +4022,8 @@ fill_NA_using_formula = function(.data, .data_FIXED = .data_FIXED %>% - when( need_log ~ mutate(., !!.abundance := log1p(!!.abundance)), ~ (.) ) %>% - when( need_log & quo_is_symbol(.abundance_scaled) ~ mutate(., !!.abundance_scaled := log1p(!!.abundance_scaled)), ~ (.) ) %>% + when( need_log ~ mutate(., !!.abundance := log1p(!!.abundance)), ~ (.)) %>% + when( need_log & quo_is_symbol(.abundance_scaled) ~ mutate(., !!.abundance_scaled := log1p(!!.abundance_scaled)), ~ (.)) %>% # Group by covariate @@ -3815,11 +4051,14 @@ fill_NA_using_formula = function(.data, ) %>% # Through warning if group of size 1 - ifelse_pipe((.) %>% nrow() %>% `<` (2), warning("tidybulk says: According to your design matrix, u have sample groups of size < 2, so you your dataset could still be sparse.")) + ifelse_pipe((.) %>% nrow() %>% `<` (2), + warning("tidybulk says: According to your design matrix,", + " u have sample groups of size < 2, ", + "so you your dataset could still be sparse.")) )) %>% unnest(cov_data) %>% - when( need_log ~ mutate(., !!.abundance := exp(!!.abundance)-1), ~ (.) ) %>% - when( need_log & quo_is_symbol(.abundance_scaled) ~ mutate(., !!.abundance_scaled := exp(!!.abundance_scaled)-1), ~ (.) 
) + when( need_log ~ mutate(., !!.abundance := exp(!!.abundance)-1), ~ (.)) %>% + when( need_log & quo_is_symbol(.abundance_scaled) ~ mutate(., !!.abundance_scaled := exp(!!.abundance_scaled)-1), ~ (.)) .data_OK %>% @@ -3835,7 +4074,8 @@ fill_NA_using_formula = function(.data, } -#' This function is needed for DE in case the matrix is not rectangular, but includes NA +#' This function is needed for DE in case the matrix is not rectangular, +#' but includes NA #' #' @keywords internal #' @noRd @@ -3896,11 +4136,15 @@ fill_NA_using_value = function(.data, ) # Select just features/covariates that have missing - combo_to_impute = df_to_impute %>% anti_join(.data, by=c(quo_names(.element), quo_names(.feature))) %>% select(!!.feature, !!.element) %>% distinct() + combo_to_impute = df_to_impute %>% + anti_join(.data, by=c(quo_names(.element), + quo_names(.feature))) %>% + select(!!.feature, !!.element) %>% distinct() # Impute using median df_to_impute %>% - inner_join(combo_to_impute, by = c(quo_names(.element), quo_names(.feature))) %>% + inner_join(combo_to_impute, by = c(quo_names(.element), + quo_names(.feature))) %>% # Fill mutate(!!.value := if_else(!!.value %>% is.na, fill_with, !!.value)) %>% @@ -3912,16 +4156,17 @@ fill_NA_using_value = function(.data, # In next command avoid error if no data to impute ifelse_pipe( nrow(.) 
> 0, - ~ .x %>% left_join(.data %>% pivot_sample(!!.element), by=quo_names(.element)) + ~ .x %>% left_join(.data %>% pivot_sample(!!.element), + by=quo_names(.element)) ) %>% # Add original dataset - bind_rows(.data %>% anti_join(combo_to_impute, by=c(quo_names(.feature), quo_names(.element)))) %>% + bind_rows(.data %>% anti_join(combo_to_impute, by=c(quo_names(.feature), + quo_names(.element)))) %>% select(.data %>% colnames) %>% # Reattach internals reattach_internals(.data) - } @@ -4002,7 +4247,7 @@ fill_NA_using_value = function(.data, #' @noRd #' #' @importFrom stats p.adjust -entrez_over_to_gsea = function(my_entrez_rank, species, gene_collections = NULL){ +entrez_over_to_gsea <- function(my_entrez_rank, species, gene_collections= NULL){ # From the page # https://yulab-smu.github.io/clusterProfiler-book/chapter5.html @@ -4065,7 +4310,7 @@ entrez_over_to_gsea = function(my_entrez_rank, species, gene_collections = NULL #' @importFrom stats p.adjust #' @importFrom purrr map #' -entrez_rank_to_gsea = function(my_entrez_rank, species, gene_collections = NULL){ +entrez_rank_to_gsea <- function(my_entrez_rank, species, gene_collections= NULL){ # From the page # https://yulab-smu.github.io/clusterProfiler-book/chapter5.html @@ -4095,12 +4340,14 @@ entrez_rank_to_gsea = function(my_entrez_rank, species, gene_collections = NULL if(is.null(gene_collections ) ) my_gene_collection = msigdbr::msigdbr(species = species) else if(gene_collections |> is("character")) - my_gene_collection = msigdbr::msigdbr(species = species) %>% filter( tolower(gs_cat) %in% tolower(gene_collections) ) + my_gene_collection = msigdbr::msigdbr(species = species) %>% + filter( tolower(gs_cat) %in% tolower(gene_collections) ) else if(gene_collections |> is("list")) - my_gene_collection = tibble(gs_name=names(.), entrez_gene = . ) %>% unnest(entrez_gene) %>% mutate(gs_cat = "user_defined") + my_gene_collection = tibble(gs_name=names(.), entrez_gene = . 
) %>% + unnest(entrez_gene) %>% mutate(gs_cat = "user_defined") else - stop("tidybulk says: the gene sets should be either a character vector or a named list") - + stop("tidybulk says: the gene sets should be either a character ", + "vector or a named list") my_gene_collection |> diff --git a/R/functions_SE.R b/R/functions_SE.R index ca94af1f..14b177e4 100755 --- a/R/functions_SE.R +++ b/R/functions_SE.R @@ -10,11 +10,15 @@ #' @importFrom rlang := #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally samples) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally genes) +#' @param .abundance A column symbol with the value the clustering is based on +#' (e.g., `count`) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally samples) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally genes) #' @param of_samples A boolean -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' @param ... Further parameters passed to the function kmeans #' #' @return A tibble with additional columns @@ -62,11 +66,15 @@ get_clusters_kmeans_bulk_SE <- #' @importFrom utils install.packages #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally samples) -#' @param .element A column symbol. 
The column that is used to calculate distance (i.e., normally genes) +#' @param .abundance A column symbol with the value the clustering is based on +#' (e.g., `count`) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally samples) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally genes) #' @param of_samples A boolean -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' @param ... Further parameters passed to the function kmeans #' #' @return A tibble with additional columns @@ -121,13 +129,18 @@ get_clusters_SNN_bulk_SE <- #' @importFrom stats setNames #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering is based +#' on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of +#' interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. 
How many top genes to select #' @param of_samples A boolean -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it +#' is log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' @param scale A boolean #' #' @return A tibble with additional columns @@ -216,13 +229,19 @@ get_reduced_dimensions_MDS_bulk_SE <- #' @importFrom magrittr divide_by #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering is +#' based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of +#' interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities +#' to cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. How many top genes to select #' @param of_samples A boolean -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it +#' is log1p for RNA sequencing data, but for avoinding tranformation you can +#' use identity #' @param scale A boolean #' @param ... 
Further parameters passed to the function prcomp #' @@ -317,13 +336,18 @@ we suggest to partition the dataset for sample clusters. #' @importFrom utils install.packages #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering is +#' based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of +#' interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. How many top genes to select #' @param of_samples A boolean -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' @param scale A boolean #' @param ... Further parameters passed to the function Rtsne #' @@ -360,10 +384,11 @@ get_reduced_dimensions_TSNE_bulk_SE <- } # Set perprexity to not be too high - if (!"perplexity" %in% names(arguments)) - arguments = arguments %>% c(perplexity = (( - .data %>% ncol() %>% sum(-1) - ) / 3 / 2) %>% floor() %>% min(30)) + if (!"perplexity" %in% names(arguments)) { + arguments <- arguments %>% c(perplexity = (( + .data %>% distinct(!!.element) %>% nrow() %>% sum(-1) + ) / 3 / 2) %>% floor(.) 
%>% min(30)) + } # If not enough samples stop if (arguments$perplexity <= 2) @@ -400,15 +425,22 @@ get_reduced_dimensions_TSNE_bulk_SE <- #' @importFrom utils install.packages #' #' @param .data A tibble -#' @param .abundance A column symbol with the value the clustering is based on (e.g., `count`) -#' @param .dims A integer vector corresponding to principal components of interest (e.g., 1:6) -#' @param .feature A column symbol. The column that is represents entities to cluster (i.e., normally genes) -#' @param .element A column symbol. The column that is used to calculate distance (i.e., normally samples) +#' @param .abundance A column symbol with the value the clustering is +#' based on (e.g., `count`) +#' @param .dims A integer vector corresponding to principal components of +#' interest (e.g., 1:6) +#' @param .feature A column symbol. The column that is represents entities to +#' cluster (i.e., normally genes) +#' @param .element A column symbol. The column that is used to calculate +#' distance (i.e., normally samples) #' @param top An integer. How many top genes to select #' @param of_samples A boolean -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity -#' @param calculate_for_pca_dimensions An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered -#' @param ... Further parameters passed to the function uwot::tumap +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param calculate_for_pca_dimensions An integer of length one. The number of +#' PCA dimensions to based the UMAP calculatio on. If NULL all variable +#' features are considered +#' @param ... 
Further parameters passed to the function uwot #' #' @return A tibble with additional columns #' @@ -512,7 +544,10 @@ filter_if_abundant_were_identified = function(.data){ when( ".abundant" %in% (rowData(.data) %>% colnames()) ~ .data[rowData(.data)[,".abundant"],], ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") + warning("tidybulk says: highly abundant transcripts were not identified ", + "(i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore ", + "this operation will be performed on unfiltered data. In rare occasions ", + "this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") (.) } ) @@ -528,7 +563,8 @@ filter_if_abundant_were_identified = function(.data){ #' @param .transcript A character name of the transcript/gene column #' @param .abundance A character name of the read count column #' @param top An integer. 
How many top genes to select -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can use identity #' #' @return A tibble filtered genes #' @@ -559,7 +595,8 @@ keep_variable_transcripts_SE = function(.data, } -#' Drop redundant elements (e.g., samples) for which feature (e.g., genes) aboundances are correlated +#' Drop redundant elements (e.g., samples) for which feature (e.g., genes) +#' aboundances are correlated #' #' @keywords internal #' @noRd @@ -578,8 +615,8 @@ keep_variable_transcripts_SE = function(.data, #' #' remove_redundancy_elements_through_correlation_SE <- function(.data, - correlation_threshold = 0.9, - of_samples = TRUE) { + correlation_threshold = 0.9, + of_samples = TRUE) { # Comply with CRAN NOTES . = NULL @@ -705,26 +742,36 @@ remove_redundancy_elements_though_reduced_dimensions_SE <- #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables -#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param test_above_log2_fold_change A positive real value. This works for edgeR and limma_voom methods. It uses the `treat` function, which tests that the difference in abundance is bigger than this threshold rather than zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. -#' @param scaling_method A character string. 
The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param .formula a formula with no response variable, referring only to +#' numeric variables +#' @param .contrasts A character vector. See edgeR makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., +#' QLF), "edgeR_likelihood_ratio" (i.e., LRT) +#' @param test_above_log2_fold_change A positive real value. This works for +#' edgeR and limma_voom methods. It uses the `treat` function, which tests that +#' the difference in abundance is bigger than this threshold rather than zero +#' \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE", +#' "upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. #' #' @return A tibble with edgeR results #' get_differential_transcript_abundance_bulk_SE <- function(.data, - .formula, - .abundance = NULL, - sample_annotation, - .contrasts = NULL, - method = "edgeR_quasi_likelihood", - test_above_log2_fold_change = NULL, - scaling_method = "TMM", - omit_contrast_in_colnames = FALSE, - prefix = "", - ...) { + .formula, + .abundance = NULL, + sample_annotation, + .contrasts = NULL, + method = "edgeR_quasi_likelihood", + test_above_log2_fold_change = NULL, + scaling_method = "TMM", + omit_contrast_in_colnames = FALSE, + prefix = "", + ...) 
{ .abundance = enquo(.abundance) @@ -743,8 +790,9 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # Replace `:` with ___ because it creates error with edgeR if(design |> colnames() |> str_detect(":") |> any()) { - message("tidybulk says: the interaction term `:` has been replaced with `___` in the design matrix, in order to work with edgeR.") - colnames(design) = design |> colnames() |> str_replace(":", "___") + message("tidybulk says: the interaction term `:` has been replaced ", + "with `___` in the design matrix, in order to work with edgeR.") + colnames(design) = design |> colnames() |> str_replace(":", "___") } # Print the design column names in case I want contrasts @@ -791,9 +839,12 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # select method when( - tolower(method) == "edger_likelihood_ratio" ~ (.) %>% edgeR::estimateDisp(design) %>% edgeR::glmFit(design), - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% edgeR::estimateDisp(design) %>% edgeR::glmQLFit(design), - tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% edgeR::estimateGLMRobustDisp(design) %>% edgeR::glmFit(design) + tolower(method) == "edger_likelihood_ratio" ~ (.) %>% + edgeR::estimateDisp(design) %>% edgeR::glmFit(design), + tolower(method) == "edger_quasi_likelihood" ~ (.) %>% + edgeR::estimateDisp(design) %>% edgeR::glmQLFit(design), + tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% + edgeR::estimateGLMRobustDisp(design) %>% edgeR::glmFit(design) ) # Return @@ -811,9 +862,14 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # select method when( - !is.null(test_above_log2_fold_change) ~ (.) %>% edgeR::glmTreat(coef = 2, contrast = my_contrasts, lfc=test_above_log2_fold_change), - tolower(method) %in% c("edger_likelihood_ratio", "edger_robust_likelihood_ratio") ~ (.) %>% edgeR::glmLRT(coef = 2, contrast = my_contrasts) , - tolower(method) == "edger_quasi_likelihood" ~ (.) 
%>% edgeR::glmQLFTest(coef = 2, contrast = my_contrasts) + !is.null(test_above_log2_fold_change) ~ (.) %>% + edgeR::glmTreat(coef = 2, contrast = my_contrasts, + lfc=test_above_log2_fold_change), + tolower(method) %in% c("edger_likelihood_ratio", + "edger_robust_likelihood_ratio") ~ (.) %>% + edgeR::glmLRT(coef = 2, contrast = my_contrasts) , + tolower(method) == "edger_quasi_likelihood" ~ (.) %>% + edgeR::glmQLFTest(coef = 2, contrast = my_contrasts) ) %>% # Convert to tibble @@ -837,9 +893,14 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, # select method when( - !is.null(test_above_log2_fold_change) ~ (.) %>% edgeR::glmTreat(coef = 2, contrast = my_contrasts[, .x], lfc=test_above_log2_fold_change), - tolower(method) %in% c("edger_likelihood_ratio", "edger_robust_likelihood_ratio") ~ (.) %>% edgeR::glmLRT(coef = 2, contrast = my_contrasts[, .x]) , - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% edgeR::glmQLFTest(coef = 2, contrast = my_contrasts[, .x]) + !is.null(test_above_log2_fold_change) ~ (.) %>% + edgeR::glmTreat(coef = 2, contrast = my_contrasts[, .x], + lfc=test_above_log2_fold_change), + tolower(method) %in% c("edger_likelihood_ratio", + "edger_robust_likelihood_ratio") ~ (.) %>% + edgeR::glmLRT(coef = 2, contrast = my_contrasts[, .x]) , + tolower(method) == "edger_quasi_likelihood" ~ (.) 
%>% + edgeR::glmQLFTest(coef = 2, contrast = my_contrasts[, .x]) ) %>% # Convert to tibble @@ -863,10 +924,6 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, sprintf("%s%s", prefix, colnames(.)[2:ncol(.)]) )) ) - - - - } @@ -875,8 +932,6 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -885,24 +940,29 @@ get_differential_transcript_abundance_bulk_SE <- function(.data, #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables -#' @param .contrasts A character vector. See voom makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param .formula a formula with no response variable, +#' referring only to numeric variables +#' @param .contrasts A character vector. See voom makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) #' @param method A string character. Either "limma_voom", "limma_voom_sample_weights" -#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. 
#' #' @return A tibble with voom results #' get_differential_transcript_abundance_bulk_voom_SE <- function(.data, - .formula, - .abundance = NULL, - sample_annotation, - .contrasts = NULL, - method = NULL, - test_above_log2_fold_change = NULL, - scaling_method = "TMM", - omit_contrast_in_colnames = FALSE, - prefix = "") { + .formula, + .abundance = NULL, + sample_annotation, + .contrasts = NULL, + method = NULL, + test_above_log2_fold_change = NULL, + scaling_method = "TMM", + omit_contrast_in_colnames = FALSE, + prefix = "") { .abundance = enquo(.abundance) @@ -966,7 +1026,8 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, # select method when( tolower(method) == "limma_voom" ~ (.) %>% limma::voom(design, plot=FALSE), - tolower(method) == "limma_voom_sample_weights" ~ (.) %>% limma::voomWithQualityWeights(design, plot=FALSE) + tolower(method) == "limma_voom_sample_weights" ~ (.) %>% + limma::voomWithQualityWeights(design, plot=FALSE) ) %>% limma::lmFit(design) @@ -985,7 +1046,8 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, ~ .x %>% # Contrasts - limma::contrasts.fit(contrasts=my_contrasts, coefficients = when(my_contrasts, is.null(.) ~ 2)) %>% + limma::contrasts.fit(contrasts=my_contrasts, + coefficients = when(my_contrasts, is.null(.) ~ 2)) %>% limma::eBayes() %>% when( @@ -1066,25 +1128,30 @@ get_differential_transcript_abundance_bulk_voom_SE <- function(.data, #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables -#' @param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param scaling_method A character string. 
The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param .formula a formula with no response variable, referring only to +#' numeric variables +#' @param .contrasts A character vector. See edgeR makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), +#' "edgeR_likelihood_ratio" (i.e., LRT) +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") #' @param .scaling_factor A tidyeval (column name) for the precalculated TMM scaling -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. #' @param ... Additional arguments for glmmSeq #' #' @return A tibble with glmmSeq results #' get_differential_transcript_abundance_glmmSeq_SE <- function(.data, - .formula, - .abundance = NULL, - .contrasts = NULL, - sample_annotation , - method, - - test_above_log2_fold_change = NULL, + .formula, + .abundance = NULL, + .contrasts = NULL, + sample_annotation , + method, + test_above_log2_fold_change = NULL, scaling_method = "TMM", .scaling_factor = NULL, omit_contrast_in_colnames = FALSE, @@ -1101,7 +1168,9 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, .contrasts %>% is.null %>% not() & .contrasts %>% class %>% equals("list") %>% not() ) - stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. 
list(c(\"genotype\",\"knockout\",\"wildtype\"))") + stop("tidybulk says: for DESeq2 the list of constrasts should be given in ", + "the form list(c(\"condition_column\",\"condition1\",\"condition2\")) ", + "i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ @@ -1149,10 +1218,12 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, data = metadata ) - if(.dispersion |> quo_is_symbolic()) - dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> as_tibble(rownames = feature__$name) |> deframe() + if(quo_is_symbolic(.dispersion)) + dispersion = rowData(.data)[,quo_name(.dispersion),drop=FALSE] |> + as_tibble(rownames = feature__$name) |> deframe() else - dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> setNames(rownames(counts)) + dispersion = counts |> edgeR::estimateDisp(design = design) %$% tagwise.dispersion |> + setNames(rownames(counts)) # # Check dispersion # if(!names(dispersion) |> sort() |> identical( @@ -1184,7 +1255,8 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, glmmSeq_object |> summary_lmmSeq() |> as_tibble(rownames = "transcript") |> - mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), .names = "{.col}_{.fn}")) |> + mutate(across(starts_with("P_"), list(adjusted = function(x) p.adjust(x, method="BH")), + .names = "{.col}_{.fn}")) |> # Attach attributes reattach_internals(.data) %>% @@ -1220,8 +1292,6 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, #' @keywords internal #' @noRd #' -#' -#' #' @import tibble #' @importFrom magrittr set_colnames #' @importFrom stats model.matrix @@ -1230,27 +1300,33 @@ get_differential_transcript_abundance_glmmSeq_SE <- function(.data, #' #' #' @param .data A tibble -#' @param .formula a formula with no response variable, referring only to numeric variables -#' 
@param .contrasts A character vector. See edgeR makeContrasts specification for the parameter `contrasts`. If contrasts are not present the first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT) -#' @param scaling_method A character string. The scaling method passed to the backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. +#' @param .formula a formula with no response variable, referring only to +#' numeric variables +#' @param .contrasts A character vector. See edgeR makeContrasts specification +#' for the parameter `contrasts`. If contrasts are not present the first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), +#' "edgeR_likelihood_ratio" (i.e., LRT) +#' @param scaling_method A character string. The scaling method passed to the +#' backend function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. #' @param ... Additional arguments for DESeq2 #' #' @return A tibble with DESeq2 results #' get_differential_transcript_abundance_deseq2_SE <- function(.data, - .formula, - .abundance = NULL, - .contrasts = NULL, - method = "deseq2", + .formula, + .abundance = NULL, + .contrasts = NULL, + method = "deseq2", - test_above_log2_fold_change = NULL, + test_above_log2_fold_change = NULL, - scaling_method = "TMM", - omit_contrast_in_colnames = FALSE, - prefix = "", - ...) { + scaling_method = "TMM", + omit_contrast_in_colnames = FALSE, + prefix = "", + ...) 
{ .abundance = enquo(.abundance) @@ -1259,7 +1335,9 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, .contrasts %>% is.null %>% not() & .contrasts %>% class %>% equals("list") %>% not() ) - stop("tidybulk says: for DESeq2 the list of constrasts should be given in the form list(c(\"condition_column\",\"condition1\",\"condition2\")) i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") + stop("tidybulk says: for DESeq2 the list of constrasts should be given in ", + "the form list(c(\"condition_column\",\"condition1\",\"condition2\")) ", + "i.e. list(c(\"genotype\",\"knockout\",\"wildtype\"))") # Check if omit_contrast_in_colnames is correctly setup if(omit_contrast_in_colnames & length(.contrasts) > 1){ @@ -1323,15 +1401,18 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, (.) %>% DESeq2::results(contrast = c( parse_formula(.formula)[1], - deseq2_object@colData[,parse_formula(.formula)[1]] %>% as.factor() %>% levels %>% .[2], - deseq2_object@colData[,parse_formula(.formula)[1]] %>% as.factor() %>% levels %>% .[1] + deseq2_object@colData[,parse_formula(.formula)[1]] %>% + as.factor() %>% levels %>% .[2], + deseq2_object@colData[,parse_formula(.formula)[1]] %>% + as.factor() %>% levels %>% .[1] ), lfcThreshold=test_above_log2_fold_change) %>% as_tibble(rownames = "transcript"), # Simple comparison discrete my_contrasts %>% is.null %>% not() & omit_contrast_in_colnames ~ (.) 
%>% - DESeq2::results(contrast = my_contrasts[[1]], lfcThreshold=test_above_log2_fold_change)%>% + DESeq2::results(contrast = my_contrasts[[1]], + lfcThreshold=test_above_log2_fold_change)%>% as_tibble(rownames = "transcript"), # Multiple comparisons NOT USED AT THE MOMENT @@ -1343,11 +1424,13 @@ get_differential_transcript_abundance_deseq2_SE <- function(.data, ~ deseq2_obj %>% # select method - DESeq2::results(contrast = my_contrasts[[.x]], lfcThreshold=test_above_log2_fold_change) %>% + DESeq2::results(contrast = my_contrasts[[.x]], + lfcThreshold=test_above_log2_fold_change) %>% # Convert to tibble as_tibble(rownames = "transcript") %>% - mutate(constrast = sprintf("%s %s-%s", my_contrasts[[.x]][1], my_contrasts[[.x]][2], my_contrasts[[.x]][3]) ) + mutate(constrast = sprintf("%s %s-%s", my_contrasts[[.x]][1], + my_contrasts[[.x]][2], my_contrasts[[.x]][3]) ) ) %>% pivot_wider(values_from = -c(transcript, constrast), diff --git a/R/ggplot.R b/R/ggplot.R index 489c3f62..1a020c58 100644 --- a/R/ggplot.R +++ b/R/ggplot.R @@ -34,8 +34,9 @@ log10_reverse_trans <- function() { #' @description it perform logit scaling with right axis formatting. To not be used directly but with ggplot (e.g. 
scale_y_continuous(trans = "log10_reverse") ) #' #' @importFrom scales label_scientific +#' @importFrom scales trans_new #' @importFrom scales extended_breaks -#' @importFrom stats qlogis plogis +#' @importFrom functional Compose #' #' @return A scales object #' @@ -52,12 +53,6 @@ log10_reverse_trans <- function() { #' @export logit_trans <- function(){ - - if (find.package("functional", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing functional needed for analyses") - install.packages("functional", repos = "https://cloud.r-project.org") - } - trans <- qlogis inv <- plogis diff --git a/R/glmmSeq.R b/R/glmmSeq.R index 9eb0b9db..b0d267e9 100644 --- a/R/glmmSeq.R +++ b/R/glmmSeq.R @@ -164,7 +164,9 @@ glmmTMB_standard_error = function (model){ #' @importFrom purrr map2_dfc #' @importFrom tidyr pivot_longer #' @importFrom tidyr pivot_wider +#' @importFrom dplyr left_join #' @importFrom dplyr join_by +#' @importFrom dplyr select #' #' @keywords internal #' @noRd @@ -181,7 +183,7 @@ glmmTMB_to_confidence_intervals_random_effects = function(fit){ "group_id" |> c(sprintf("%s__%s", .y, colnames(.x))) ) |> - pivot_longer(-group_id, names_to = "parameter", values_to = "CI") + tidyr::pivot_longer(-group_id, names_to = "parameter", values_to = "CI") ) mod = glmmTMB::ranef(fit, condVar=T)$cond @@ -193,15 +195,15 @@ glmmTMB_to_confidence_intervals_random_effects = function(fit){ "group_id" |> c(sprintf("%s__%s", .y, colnames(.x))) ) |> - pivot_longer(-group_id, names_to = "parameter", values_to = "mode") + tidyr::pivot_longer(-group_id, names_to = "parameter", values_to = "mode") ) mod |> - left_join(ster, join_by(group_id, parameter)) |> - mutate(lower = mode - CI, upper = mode + CI) |> - select(-CI) |> + dplyr::left_join(ster, dplyr::join_by(group_id, parameter)) |> + dplyr::mutate(lower = mode - CI, upper = mode + CI) |> + dplyr::select(-CI) |> tidyr::unite("parameter", c(group_id, parameter), sep="_") |> - pivot_wider(names_from = parameter, values_from = 
c(lower, mode, upper), names_glue = "{parameter}__{.value}") + tidyr::pivot_wider(names_from = parameter, values_from = c(lower, mode, upper), names_glue = "{parameter}__{.value}") } @@ -420,6 +422,13 @@ setClassUnion("formulaOrNULL", c("formula", "NULL")) #' #' @keywords internal #' @noRd +#' +#' @import lme4 +#' @importFrom glmmTMB glmmTMBControl +#' @importFrom parallel makeCluster +#' @importFrom parallel clusterExport +#' @importFrom pbapply pblapply +#' @importFrom pbmcapply pbmclapply #' #' @slot info List including the matched call, dispersions, offset, designMatrix #' @slot formula The model formula @@ -580,14 +589,6 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N clusterExport(cl, varlist = varlist, envir = environment()) if (progress) { - # Check if package is installed, otherwise install - if (find.package("pblapply", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing pblapply needed for differential transcript abundance analyses") - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager", repos = "https://cloud.r-project.org") - BiocManager::install("pblapply", ask = FALSE) - } - resultList <- pbapply::pblapply(fullList, function(geneList) { args <- c(list(geneList = geneList, fullFormula = fullFormula, reduced = reduced, data = subsetMetadata, @@ -617,6 +618,7 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N } } else { + if(avoid_forking){ #library(parallel) cl = parallel::makeCluster(cores, type = "PSOCK") @@ -642,6 +644,7 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N BiocManager::install("pbmcapply", ask = FALSE) } + resultList <- pbmcapply::pbmclapply(fullList, function(geneList) { glmerCore(geneList, fullFormula, reduced, subsetMetadata, control, offset, modelData, @@ -687,14 +690,6 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N 
parallel::clusterExport(cl, varlist = varlist, envir = environment()) if (progress) { - # Check if package is installed, otherwise install - if (find.package("pblapply", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing pblapply needed for differential transcript abundance analyses") - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager", repos = "https://cloud.r-project.org") - BiocManager::install("pblapply", ask = FALSE) - } - resultList <- pbapply::pblapply(fullList, function(geneList) { args <- c(list(geneList = geneList, fullFormula = fullFormula, reduced = reduced, data = subsetMetadata, @@ -720,14 +715,6 @@ glmmSeq = function (modelFormula, countdata, metadata, id = NULL, dispersion = N else { if (progress) { - # Check if package is installed, otherwise install - if (find.package("pbmcapply", quiet = TRUE) %>% length %>% equals(0)) { - message("tidybulk says: Installing pbmcapply needed for differential transcript abundance analyses") - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager", repos = "https://cloud.r-project.org") - BiocManager::install("pbmcapply", ask = FALSE) - } - resultList <- pbmcapply::pbmclapply(fullList, function(geneList) { glmmTMBcore(geneList, fullFormula, reduced, subsetMetadata, family, control, offset, diff --git a/R/methods.R b/R/methods.R index 4dab28a3..7508d888 100755 --- a/R/methods.R +++ b/R/methods.R @@ -1,30 +1,37 @@ setOldClass("tidybulk") -#' Creates an annotated `tidybulk` tibble from a `tbl` or `SummarizedExperiment` object +#' Creates an annotated `tidybulk` tibble from a `tbl` or +#' `SummarizedExperiment` object #' #' `r lifecycle::badge("maturing")` #' -#' @description tidybulk() creates an annotated `tidybulk` tibble from a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' 
@description tidybulk() creates an annotated `tidybulk` tibble from a +#' `tbl` (with at least three columns for sample, feature and transcript +#' abundance) or `SummarizedExperiment` (more convenient if abstracted to +#' tibble with library(tidySummarizedExperiment)) #' #' @importFrom rlang enquo #' @importFrom rlang quo_is_missing -#' #' @import readr #' @import SummarizedExperiment #' @import methods #' #' @name tidybulk #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param .abundance_scaled The name of the transcript/gene scaled abundance column +#' @param .abundance_scaled The name of the transcript/gene scaled +#' abundance column #' #' @details This function creates a tidybulk object and is useful if you want #' to avoid to specify .sample, .transcript and .abundance arguments all the times. #' The tidybulk object have an attribute called internals where these three -#' arguments are stored as metadata. They can be extracted as attr(, "internals"). +#' arguments are stored as metadata. They can be extracted as +#' attr(, "internals"). 
#' #' @return A `tidybulk` object #' @@ -40,18 +47,18 @@ setOldClass("tidybulk") #' @export #' setGeneric("tidybulk", function(.data, - .sample, - .transcript, - .abundance, - .abundance_scaled = NULL) - standardGeneric("tidybulk")) + .sample, + .transcript, + .abundance, + .abundance_scaled = NULL) + standardGeneric("tidybulk")) # Set internal -.tidybulk = function(.data, - .sample, - .transcript, - .abundance, - .abundance_scaled = NULL) { +.tidybulk <- function(.data, + .sample, + .transcript, + .abundance, + .abundance_scaled = NULL) { # Make col names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -62,7 +69,8 @@ setGeneric("tidybulk", function(.data, quo_is_missing(.sample) | quo_is_missing(.transcript) | quo_is_missing(.abundance) - ) stop("tidybulk says: the arguments .sample, .transcript and .abundance must include column names (not surrounded by quotes)") + ) stop("tidybulk says: the arguments .sample, .transcript and .abundance ", + "must include column names (not surrounded by quotes)") # Validate data frame if(do_validate()) validation(.data, @@ -81,8 +89,6 @@ setGeneric("tidybulk", function(.data, #' #' @export #' -#' @inheritParams tidybulk -#' #' @docType methods #' @rdname tidybulk-methods #' @@ -96,8 +102,6 @@ setMethod("tidybulk", "spec_tbl_df", .tidybulk) #' #' @importFrom purrr map2 #' -#' @inheritParams tidybulk -#' #' @docType methods #' @rdname tidybulk-methods #' @@ -108,7 +112,9 @@ setMethod("tidybulk", "tbl_df", .tidybulk) #' as_SummarizedExperiment #' -#' @description as_SummarizedExperiment() creates a `SummarizedExperiment` object from a `tbl` or `tidybulk` tbl formatted as | | | | <...> | +#' @description as_SummarizedExperiment() creates a `SummarizedExperiment` +#' object from a `tbl` or `tidybulk` tbl formatted as | | +#' | | <...> | #' #' #' @importFrom utils data @@ -126,16 +132,16 @@ setMethod("tidybulk", "tbl_df", .tidybulk) #' @export #' setGeneric("as_SummarizedExperiment", function(.data, - .sample = NULL, - 
.transcript = NULL, - .abundance = NULL) + .sample = NULL, + .transcript = NULL, + .abundance = NULL) standardGeneric("as_SummarizedExperiment")) -.as_SummarizedExperiment = function(.data, - .sample = NULL, - .transcript = NULL, - .abundance = NULL) { +.as_SummarizedExperiment <- function(.data, + .sample = NULL, + .transcript = NULL, + .abundance = NULL) { # Fix NOTEs . = NULL @@ -144,7 +150,8 @@ setGeneric("as_SummarizedExperiment", function(.data, .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) - col_names = get_sample_transcript_counts(.data, .sample, .transcript, .abundance) + col_names = get_sample_transcript_counts(.data, .sample, + .transcript, .abundance) .sample = col_names$.sample .transcript = col_names$.transcript .abundance = col_names$.abundance @@ -230,7 +237,8 @@ setGeneric("as_SummarizedExperiment", function(.data, counts_cols) %>% distinct() %>% - pivot_longer( cols=-c(!!feature__$symbol,!!sample__$symbol), names_to="assay", values_to= ".a") %>% + pivot_longer( cols=-c(!!feature__$symbol,!!sample__$symbol), + names_to="assay", values_to= ".a") %>% nest(`data` = -`assay`) %>% mutate(`data` = `data` %>% map( ~ .x %>% @@ -259,8 +267,6 @@ setGeneric("as_SummarizedExperiment", function(.data, #' #' @export #' -#' @inheritParams as_SummarizedExperiment -#' #' @docType methods #' @rdname as_SummarizedExperiment-methods #' @@ -272,8 +278,6 @@ setMethod("as_SummarizedExperiment", "spec_tbl_df", .as_SummarizedExperiment) #' #' @export #' -#' @inheritParams as_SummarizedExperiment -#' #' @docType methods #' @rdname as_SummarizedExperiment-methods #' @@ -285,8 +289,6 @@ setMethod("as_SummarizedExperiment", "tbl_df", .as_SummarizedExperiment) #' #' @export #' -#' @inheritParams as_SummarizedExperiment -#' #' @docType methods #' @rdname as_SummarizedExperiment-methods #' @@ -299,21 +301,27 @@ setMethod("as_SummarizedExperiment", "tidybulk", .as_SummarizedExperiment) #' #' `r lifecycle::badge("maturing")` #' -#' 
@description tidybulk_SAM_BAM() creates a `tt` object from A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @description tidybulk_SAM_BAM() creates a `tt` object from A `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) #' #' @importFrom rlang enquo #' -#' #' @name tidybulk_SAM_BAM #' #' @param file_names A character vector -#' @param genome A character string specifying an in-built annotation used for read summarization. It has four possible values including "mm10", "mm9", "hg38" and "hg19" +#' @param genome A character string specifying an in-built annotation used for +#' read summarization. It has four possible values including "mm10", "mm9", +#' "hg38" and "hg19" #' @param ... Further parameters passed to the function Rsubread::featureCounts #' -#' @details This function is based on FeatureCounts package (DOI: 10.1093/bioinformatics/btt656). This function creates a tidybulk object and is useful if you want -#' to avoid to specify .sample, .transcript and .abundance arguments all the times. -#' The tidybulk object have an attribute called internals where these three -#' arguments are stored as metadata. They can be extracted as attr(, "internals"). +#' @details This function is based on FeatureCounts package +#' (DOI: 10.1093/bioinformatics/btt656). This function creates a tidybulk +#' object and is useful if you want to avoid to specify .sample, +#' .transcript and .abundance arguments all the times. The tidybulk object +#' have an attribute called internals where these three arguments are stored +#' as metadata. They can be extracted as attr(, "internals"). #' #' Underlying core function #' Rsubread::featureCounts(annot.inbuilt = genome,nthreads = n_cores, ...) 
@@ -335,62 +343,68 @@ setGeneric("tidybulk_SAM_BAM", function(file_names, genome = "hg38", ...) #' #' @export #' -#' @inheritParams tidybulk_SAM_BAM-methods -#' #' @docType methods #' @rdname tidybulk_SAM_BAM-methods #' #' @return A `tidybulk` object #' -setMethod("tidybulk_SAM_BAM", c(file_names = "character", genome = "character"), function(file_names, genome = "hg38", ...) +setMethod("tidybulk_SAM_BAM", c(file_names = "character", genome = "character"), + function(file_names, genome = "hg38", ...) create_tt_from_bam_sam_bulk(file_names = file_names, genome = genome, ...)) #' Scale the counts of transcripts/genes #' #' `r lifecycle::badge("maturing")` #' -#' @description scale_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). +#' @description scale_abundance() takes as input a `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and scales transcript abundance +#' compensating for sequencing depth (e.g., with TMM algorithm, +#' Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25).
#' #' @importFrom rlang enquo -#' #' @importFrom stats median #' #' @name scale_abundance #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, +#' feature and transcript abundance) or `SummarizedExperiment` +#' (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A character string. The scaling method passed to the back-end function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") -#' @param reference_sample A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference. -#' @param .subset_for_scaling A gene-wise quosure condition. This will be used to filter rows (features/genes) of the dataset. For example -#' @param action A character string between "add" (default) and "only". "add" joins the new information to the input tbl (default), "only" return a non-redundant tbl with the just new information. +#' @param method A character string. The scaling method passed to the back-end +#' function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile") +#' @param reference_sample A character string. The name of the reference sample. +#' If NULL the sample with highest total read count will be selected as reference. +#' @param .subset_for_scaling A gene-wise quosure condition. This will be used +#' to filter rows (features/genes) of the dataset. For example +#' @param action A character string between "add" (default) and "only". 
+#' "add" joins the new information to the input tbl (default), "only" +#' return a non-redundant tbl with the just new information. #' #' @param reference_selection_function DEPRECATED. please use reference_sample. #' #' @details Scales transcript abundance compensating for sequencing depth #' (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). -#' Lowly transcribed transcripts/genes (defined with minimum_counts and minimum_proportion parameters) -#' are filtered out from the scaling procedure. +#' Lowly transcribed transcripts/genes (defined with minimum_counts +#' and minimum_proportion parameters) are filtered out from the scaling procedure. #' The scaling inference is then applied back to all unfiltered data. #' #' Underlying method #' edgeR::calcNormFactors(.data, method = c("TMM","TMMwsp","RLE","upperquartile")) #' #' -#' -#' @return A tbl object with additional columns with scaled data as `_scaled` -#' +#' @return A tbl object with additional columns with scaled data +#' as `_scaled` #' #' @examples -#' -#' #' tidybulk::se_mini |> #' identify_abundant() |> #' scale_abundance() #' -#' -#' #' @docType methods #' @rdname scale_abundance-methods #' @export @@ -403,13 +417,12 @@ setGeneric("scale_abundance", function(.data, reference_sample = NULL, .subset_for_scaling = NULL, action = "add", - # DEPRECATED reference_selection_function = NULL) standardGeneric("scale_abundance")) # Set internal -.scale_abundance = function(.data, +.scale_abundance <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, @@ -417,10 +430,8 @@ setGeneric("scale_abundance", function(.data, reference_sample = NULL, .subset_for_scaling = NULL, action = "add", - # DEPRECATED - reference_selection_function = NULL) -{ + reference_selection_function = NULL) { # Fix NOTEs . 
= NULL @@ -429,7 +440,8 @@ setGeneric("scale_abundance", function(.data, .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) - col_names = get_sample_transcript_counts(.data, .sample, .transcript, .abundance) + col_names = get_sample_transcript_counts(.data, .sample, + .transcript, .abundance) .sample = col_names$.sample .transcript = col_names$.transcript .abundance = col_names$.abundance @@ -441,9 +453,9 @@ setGeneric("scale_abundance", function(.data, # DEPRECATION OF reference function if (is_present(reference_selection_function) & !is.null(reference_selection_function)) { - # Signal the deprecation to the user - deprecate_warn("1.1.8", "tidybulk::scale_abundance(reference_selection_function = )", details = "The argument reference_selection_function is now deprecated please use reference_sample. By default the reference selection function is max()") + deprecate_warn("1.1.8", "tidybulk::scale_abundance(reference_selection_function = )", + details = "The argument reference_selection_function is now deprecated please use reference_sample. By default the reference selection function is max()") } @@ -464,7 +476,11 @@ setGeneric("scale_abundance", function(.data, when( ".abundant" %in% colnames(.) ~ filter(., .abundant), ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") + warning("tidybulk says: highly abundant transcripts were not identified", + " (i.e. identify_abundant()) or filtered (i.e., keep_abundant), ", + "therefore this operation will be performed on unfiltered data. ", + "In rare occasions this could be wanted. In standard whole-transcriptome ", + "workflows is generally unwanted.") (.) 
} ) %>% @@ -476,7 +492,9 @@ setGeneric("scale_abundance", function(.data, ) %>% # Check I have genes left - when(nrow(.) == 0 ~ stop("tidybulk says: there are 0 genes that passes the filters (.abundant and/or .subset_for_scaling). Please check your filtering or your data."), ~ (.)) %>% + when(nrow(.) == 0 ~ stop("tidybulk says: there are 0 genes that passes ", + "the filters (.abundant and/or .subset_for_scaling). ", + "Please check your filtering or your data."), ~ (.)) %>% get_scaled_counts_bulk( .sample = !!.sample, @@ -523,7 +541,8 @@ setGeneric("scale_abundance", function(.data, else if (action == "only") .data_norm else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } @@ -531,26 +550,22 @@ setGeneric("scale_abundance", function(.data, #' scale_abundance #' #' @export -#' -#' @inheritParams scale_abundance -#' #' @docType methods #' @rdname scale_abundance-methods #' -#' @return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl object with additional columns with scaled data +#' as `_scaled` #' setMethod("scale_abundance", "spec_tbl_df", .scale_abundance) #' scale_abundance #' #' @export -#' -#' @inheritParams scale_abundance -#' #' @docType methods #' @rdname scale_abundance-methods #' -#' @return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl object with additional columns with scaled data +#' as `_scaled` #' setMethod("scale_abundance", "tbl_df", .scale_abundance) @@ -558,12 +573,11 @@ setMethod("scale_abundance", "tbl_df", .scale_abundance) #' #' @export #' -#' @inheritParams scale_abundance -#' #' @docType methods #' @rdname scale_abundance-methods #' -#' @return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl 
object with additional columns with scaled data +#' as `_scaled` #' setMethod("scale_abundance", "tidybulk", .scale_abundance) @@ -573,25 +587,44 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance) #' #' `r lifecycle::badge("maturing")` #' -#' @description quantile_normalise_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). +#' @description quantile_normalise_abundance() takes as input A `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and Scales transcript abundance +#' compansating for sequencing depth (e.g., with TMM algorithm, Robinson +#' and Oshlack doi.org/10.1186/gb-2010-11-3-r25). #' #' @importFrom rlang enquo -#' #' @importFrom stats median #' @importFrom dplyr join_by #' #' @name quantile_normalise_abundance #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, +#' feature and transcript abundance) or `SummarizedExperiment` +#' (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A character string. 
Either "limma_normalize_quantiles" for limma::normalizeQuantiles or "preprocesscore_normalize_quantiles_use_target" for preprocessCore::normalize.quantiles.use.target for large-scale datasets. -#' @param target_distribution A numeric vector. If NULL the target distribution will be calculated by preprocessCore. This argument only affects the "preprocesscore_normalize_quantiles_use_target" method. -#' @param action A character string between "add" (default) and "only". "add" joins the new information to the input tbl (default), "only" return a non-redundant tbl with the just new information. +#' @param method A character string. Either "limma_normalize_quantiles" +#' for limma::normalizeQuantiles or +#' "preprocesscore_normalize_quantiles_use_target" for +#' preprocessCore::normalize.quantiles.use.target for large-scale datasets, +#' where limma may not be compatible. +#' @param action A character string between "add" (default) and "only". +#' "add" joins the new information to the input tbl (default), +#' "only" returns a non-redundant tbl with just the new information. +#' @param target_distribution A numeric vector. If NULL the target distribution +#' will be calculated by preprocessCore. This argument only affects the +#' "preprocesscore_normalize_quantiles_use_target" method. #' #' -#' @details Tranform the feature abundance across samples so to have the same quantile distribution (using preprocessCore). +#' @details Transforms the feature abundance across samples so that all +#' samples have the same quantile distribution, using +#' limma::normalizeQuantiles or +#' preprocessCore::normalize.quantiles.use.target, depending on the +#' `method` argument.
#' #' Underlying method #' @@ -607,7 +640,8 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance) #' ) #' #' -#' @return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl object with additional columns with scaled data as +#' `_scaled` #' #' #' @examples @@ -622,25 +656,25 @@ setMethod("scale_abundance", "tidybulk", .scale_abundance) #' @rdname quantile_normalise_abundance-methods #' @export -setGeneric("quantile_normalise_abundance", function(.data, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - method = "limma_normalize_quantiles", - target_distribution = NULL, - action = "add") +setGeneric("quantile_normalise_abundance", function( + .data, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + method = "limma_normalize_quantiles", + target_distribution = NULL, + action = "add") standardGeneric("quantile_normalise_abundance")) # Set internal -.quantile_normalise_abundance = function(.data, +.quantile_normalise_abundance <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, method = "limma_normalize_quantiles", - target_distribution = NULL, + target_distribution = NULL, + action = "add") { - action = "add") -{ # Fix NOTEs . 
= NULL @@ -693,7 +727,6 @@ setGeneric("quantile_normalise_abundance", function(.data, install.packages("BiocManager", repos = "https://cloud.r-project.org") BiocManager::install("preprocessCore", ask = FALSE) } - if(is.null(target_distribution)) target_distribution = preprocessCore::normalize.quantiles.determine.target(.data_norm) .data_norm_quant = @@ -741,7 +774,8 @@ setGeneric("quantile_normalise_abundance", function(.data, else if (action == "only") .data_norm else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } @@ -749,25 +783,24 @@ setGeneric("quantile_normalise_abundance", function(.data, #' #' @export #' -#' @inheritParams quantile_normalise_abundance -#' #' @docType methods #' @rdname quantile_normalise_abundance-methods #' -#' @return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl object with additional columns with scaled data +#' as `_scaled` #' -setMethod("quantile_normalise_abundance", "spec_tbl_df", .quantile_normalise_abundance) +setMethod("quantile_normalise_abundance", "spec_tbl_df", + .quantile_normalise_abundance) #' quantile_normalise_abundance #' #' @export #' -#' @inheritParams quantile_normalise_abundance -#' #' @docType methods #' @rdname quantile_normalise_abundance-methods #' -#' @return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl object with additional columns with scaled data +#' as `_scaled` #' setMethod("quantile_normalise_abundance", "tbl_df", .quantile_normalise_abundance) @@ -775,12 +808,11 @@ setMethod("quantile_normalise_abundance", "tbl_df", .quantile_normalise_abundanc #' #' @export #' -#' @inheritParams quantile_normalise_abundance -#' #' @docType methods #' @rdname quantile_normalise_abundance-methods #' -#' 
@return A tbl object with additional columns with scaled data as `_scaled` +#' @return A tbl object with additional columns with scaled data +#' as `_scaled` #' setMethod("quantile_normalise_abundance", "tidybulk", .quantile_normalise_abundance) @@ -789,29 +821,43 @@ setMethod("quantile_normalise_abundance", "tidybulk", .quantile_normalise_abunda #' #' `r lifecycle::badge("maturing")` #' -#' @description cluster_elements() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and identify clusters in the data. +#' @description cluster_elements() takes as input A `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and identify clusters in the data. #' #' @importFrom rlang enquo #' -#' #' @name cluster_elements #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, +#' feature and transcript abundance) or `SummarizedExperiment` +#' (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) #' @param .element The name of the element column (normally samples). #' @param .feature The name of the feature column (normally transcripts/genes) -#' @param .abundance The name of the column including the numerical value the clustering is based on (normally transcript abundance) -#' -#' @param method A character string. The cluster algorithm to use, at the moment k-means is the only algorithm included. -#' @param of_samples A boolean. 
In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). +#' @param .abundance The name of the column including the numerical value the +#' clustering is based on (normally transcript abundance) +#' +#' @param method A character string. The cluster algorithm to use, at the +#' moment k-means is the only algorithm included. +#' @param of_samples A boolean. In case the input is a tidybulk object, +#' it indicates Whether the element column will be sample or transcript column +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can +#' use identity +#' @param action A character string. Whether to join the new information to +#' the input tbl (add), or just get the non-redundant tbl with the new information (get). #' @param ... Further parameters passed to the function kmeans #' -#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform DEPRECATED - A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' #' @details identifies clusters in the data, normally of samples. #' This function returns a tibble with additional columns for the cluster annotation. -#' At the moment only k-means (DOI: 10.2307/2346830) and SNN clustering (DOI:10.1016/j.cell.2019.05.031) is supported, the plan is to introduce more clustering methods. 
+#' At the moment only k-means (DOI: 10.2307/2346830) and SNN clustering +#' (DOI:10.1016/j.cell.2019.05.031) is supported, the plan is to introduce more +#' clustering methods. #' #' Underlying method for kmeans #' do.call(kmeans(.data, iter.max = 1000, ...) @@ -845,32 +891,25 @@ setGeneric("cluster_elements", function(.data, method, of_samples = TRUE, transform = log1p, - action = "add", ..., - # DEPRECATED log_transform = NULL ) standardGeneric("cluster_elements")) # Set internal -.cluster_elements = function(.data, - .element = NULL, - .feature = NULL, - .abundance = NULL, - method , - of_samples = TRUE, - transform = log1p, - - action = "add", - ..., - - # DEPRECATED - log_transform = NULL - - ) -{ +.cluster_elements <- function(.data, + .element = NULL, + .feature = NULL, + .abundance = NULL, + method , + of_samples = TRUE, + transform = log1p, + action = "add", + ..., + # DEPRECATED + log_transform = NULL) { # Fix NOTEs . = NULL @@ -879,7 +918,9 @@ setGeneric("cluster_elements", function(.data, if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") + deprecate_warn("1.7.4", + "tidybulk::test_differential_abundance(log_transform = )", + details = "The argument log_transform is now deprecated, please use transform.") if(log_transform == TRUE) transform = log1p } @@ -911,7 +952,11 @@ setGeneric("cluster_elements", function(.data, when( ".abundant" %in% colnames(.) ~ filter(., .abundant), ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. 
In standard whole-transcriptome workflows is generally unwanted.") + warning("tidybulk says: highly abundant transcripts were not ", + "identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant)", + ", therefore this operation will be performed on unfiltered data. ", + "In rare occasions this could be wanted. In standard ", + "whole-transcriptome workflows is generally unwanted.") (.) } ) %>% @@ -926,7 +971,9 @@ setGeneric("cluster_elements", function(.data, transform = transform, ... ), - method == "SNN" ~ stop("tidybulk says: Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable."), + method == "SNN" ~ stop("tidybulk says: Matrix package (v1.3-3) causes ", + "an error with Seurat::FindNeighbors used in this method. We are ", + "trying to solve this issue. At the moment this option in unaviable."), # get_clusters_SNN_bulk(., # .abundance = !!.abundance, # .element = !!.element, @@ -935,7 +982,7 @@ setGeneric("cluster_elements", function(.data, # transform = transform, # ... 
# ), - TRUE ~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ") + TRUE ~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ") ) @@ -944,7 +991,7 @@ setGeneric("cluster_elements", function(.data, if (action == "add"){ .data |> - dplyr::left_join( .data_procesed, by=quo_name(.element) ) |> + dplyr::left_join( .data_procesed, by=quo_name(.element) ) |> # Attach attributes reattach_internals(.data) @@ -957,7 +1004,7 @@ setGeneric("cluster_elements", function(.data, # Selecting the right columns pivot_sample(!!.element) |> - dplyr::left_join( .data_procesed, by=quo_name(.element) ) |> + dplyr::left_join( .data_procesed, by=quo_name(.element) ) |> # Attach attributes reattach_internals(.data) @@ -966,13 +1013,13 @@ setGeneric("cluster_elements", function(.data, else if (action == "only") .data_procesed else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } #' cluster_elements -#' @inheritParams cluster_elements #' #' @docType methods #' @rdname cluster_elements-methods @@ -982,7 +1029,6 @@ setGeneric("cluster_elements", function(.data, setMethod("cluster_elements", "spec_tbl_df", .cluster_elements) #' cluster_elements -#' @inheritParams cluster_elements #' #' @docType methods #' @rdname cluster_elements-methods @@ -992,7 +1038,6 @@ setMethod("cluster_elements", "spec_tbl_df", .cluster_elements) setMethod("cluster_elements", "tbl_df", .cluster_elements) #' cluster_elements -#' @inheritParams cluster_elements #' #' @docType methods #' @rdname cluster_elements-methods @@ -1006,28 +1051,45 @@ setMethod("cluster_elements", "tidybulk", .cluster_elements) #' #' `r lifecycle::badge("maturing")` #' -#' @description reduce_dimensions() takes as input A `tbl` (with at least three columns for sample, 
feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and calculates the reduced dimensional space of the transcript abundance. +#' @description reduce_dimensions() takes as input A `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and calculates the reduced dimensional +#' space of the transcript abundance. #' #' @importFrom rlang enquo #' -#' #' @name reduce_dimensions #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, +#' feature and transcript abundance) or `SummarizedExperiment` (more +#' convenient if abstracted to tibble with library(tidySummarizedExperiment)) #' @param .element The name of the element column (normally samples). #' @param .feature The name of the feature column (normally transcripts/genes) -#' @param .abundance The name of the column including the numerical value the clustering is based on (normally transcript abundance) +#' @param .abundance The name of the column including the numerical value +#' the clustering is based on (normally transcript abundance) #' -#' @param method A character string. The dimension reduction algorithm to use (PCA, MDS, tSNE). +#' @param method A character string. The dimension reduction algorithm to +#' use (PCA, MDS, tSNE). #' @param top An integer. How many top genes to select for dimensionality reduction -#' @param of_samples A boolean. In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column -#' @param .dims An integer. 
The number of dimensions your are interested in (e.g., 4 for returning the first four principal components). -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity -#' @param scale A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE. -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). -#' @param ... Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE", or uwot::tumap if you choose method="umap" -#' -#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param of_samples A boolean. In case the input is a tidybulk object, +#' it indicates Whether the element column will be sample or transcript column +#' @param .dims An integer. The number of dimensions your are interested in +#' (e.g., 4 for returning the first four principal components). +#' @param transform A function that will tranform the counts, by default +#' it is log1p for RNA sequencing data, but for avoinding tranformation you +#' can use identity +#' @param scale A boolean for method="PCA", this will be passed to the `prcomp` +#' function. It is not included in the ... argument because although the +#' default for `prcomp` if FALSE, it is advisable to set it as TRUE. +#' @param action A character string. Whether to join the new information to +#' the input tbl (add), or just get the non-redundant tbl with the new +#' information (get). +#' @param ... 
Further parameters passed to the function prcomp if you choose +#' method="PCA", Rtsne if method="tSNE", or uwot::tumap if method="umap" +#' +#' @param log_transform DEPRECATED - A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' #' @details This function reduces the dimensions of the transcript abundances. #' It can use multi-dimensional scaling (MDS; DOI.org/10.1186/gb-2010-11-3-r25), @@ -1091,50 +1153,43 @@ setGeneric("reduce_dimensions", function(.data, .abundance = NULL, method, .dims = 2, - top = 500, of_samples = TRUE, transform = log1p, scale = TRUE, action = "add", ..., - # DEPRECATED log_transform = NULL - ) standardGeneric("reduce_dimensions")) # Set internal -.reduce_dimensions = function(.data, - .element = NULL, - .feature = NULL, - .abundance = NULL, - method, - .dims = 2, - - top = 500, - of_samples = TRUE, - transform = log1p, - scale = TRUE, - action = "add", - ..., - - # DEPRECATED - log_transform = NULL - - ) -{ +.reduce_dimensions <- function(.data, + .element = NULL, + .feature = NULL, + .abundance = NULL, + method, + .dims = 2, + top = 500, + of_samples = TRUE, + transform = log1p, + scale = TRUE, + action = "add", + ..., + # DEPRECATED + log_transform = NULL) { # Fix NOTEs . = NULL - - + # DEPRECATION OF log_transform if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") + deprecate_warn("1.7.4", + "tidybulk::test_differential_abundance(log_transform = )", + details = "The argument log_transform is now deprecated, please use transform.") if(log_transform == TRUE) transform = log1p } @@ -1177,13 +1232,15 @@ setGeneric("reduce_dimensions", function(.data, when( ".abundant" %in% colnames(.) ~ filter(., .abundant), ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e.
identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") - (.) - } + warning("tidybulk says: highly abundant transcripts were not ", + "identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), ", + "therefore this operation will be performed on unfiltered data. ", + "In rare occasions this could be wanted. In standard whole-transcriptome ", + "workflows is generally unwanted.") + (.) } ) %>% - when( - tolower(method) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk(., + tolower(method) == tolower("MDS") ~ get_reduced_dimensions_MDS_bulk(., .abundance = !!.abundance, .dims = .dims, .element = !!.element, @@ -1193,7 +1250,7 @@ setGeneric("reduce_dimensions", function(.data, transform = transform, ... ), - tolower(method) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk(., + tolower(method) == tolower("PCA") ~ get_reduced_dimensions_PCA_bulk(., .abundance = !!.abundance, .dims = .dims, .element = !!.element, @@ -1214,16 +1271,17 @@ setGeneric("reduce_dimensions", function(.data, transform = transform, ... ), - tolower(method) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk(., - .abundance = !!.abundance, - .dims = .dims, - .element = !!.element, - .feature = !!.feature, - top = top, - of_samples = of_samples, - transform = transform, - scale = scale, - ... + tolower(method) == tolower("UMAP") ~ get_reduced_dimensions_UMAP_bulk( + ., + .abundance = !!.abundance, + .dims = .dims, + .element = !!.element, + .feature = !!.feature, + top = top, + of_samples = of_samples, + transform = transform, + scale = scale, + ... 
), TRUE ~ stop("tidybulk says: method must be either \"MDS\" or \"PCA\" or \"tSNE\"") ) @@ -1255,14 +1313,14 @@ setGeneric("reduce_dimensions", function(.data, else if (action == "only") .data_processed else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } #' reduce_dimensions -#' @inheritParams reduce_dimensions #' #' @docType methods #' @rdname reduce_dimensions-methods @@ -1271,7 +1329,6 @@ setGeneric("reduce_dimensions", function(.data, setMethod("reduce_dimensions", "spec_tbl_df", .reduce_dimensions) #' reduce_dimensions -#' @inheritParams reduce_dimensions #' #' @docType methods #' @rdname reduce_dimensions-methods @@ -1280,7 +1337,6 @@ setMethod("reduce_dimensions", "spec_tbl_df", .reduce_dimensions) setMethod("reduce_dimensions", "tbl_df", .reduce_dimensions) #' reduce_dimensions -#' @inheritParams reduce_dimensions #' #' @docType methods #' @rdname reduce_dimensions-methods @@ -1293,23 +1349,30 @@ setMethod("reduce_dimensions", "tidybulk", .reduce_dimensions) #' #' `r lifecycle::badge("maturing")` #' -#' @description rotate_dimensions() takes as input a `tbl` formatted as | | | <...> | and calculates the rotated dimensional space of the transcript abundance. +#' @description rotate_dimensions() takes as input a `tbl` formatted as +#' | | | <...> | and calculates the rotated +#' dimensional space of the transcript abundance. 
#' #' @importFrom rlang enquo #' -#' #' @name rotate_dimensions #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .element The name of the element column (normally samples). #' #' @param dimension_1_column A character string. The column of the dimension 1 #' @param dimension_2_column A character string. The column of the dimension 2 #' @param rotation_degrees A real number between 0 and 360 -#' @param of_samples A boolean. In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column -#' @param dimension_1_column_rotated A character string. The column of the rotated dimension 1 (optional) -#' @param dimension_2_column_rotated A character string. The column of the rotated dimension 2 (optional) -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). +#' @param of_samples A boolean. In case the input is a tidybulk object, +#' it indicates Whether the element column will be sample or transcript column +#' @param dimension_1_column_rotated A character string. The column of the +#' rotated dimension 1 (optional) +#' @param dimension_2_column_rotated A character string. The column of the +#' rotated dimension 2 (optional) +#' @param action A character string. Whether to join the new information to the +#' input tbl (add), or just get the non-redundant tbl with the new information (get). #' #' @details This function to rotate two dimensions such as the reduced dimensions. 
#' @@ -1324,19 +1387,19 @@ setMethod("reduce_dimensions", "tidybulk", .reduce_dimensions) #' ) |> as_matrix()) %*% m) #' } #' -#' -#' @return A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. -#' +#' @return A tbl object with additional columns for the reduced dimensions. +#' additional columns for the rotated dimensions. The rotated dimensions will +#' be added to the original data set as ` rotated ` +#' by default, or as specified in the input arguments. #' #' @examples -#' #' counts.MDS = #' tidybulk::se_mini |> #' identify_abundant() |> #' reduce_dimensions( method="MDS", .dims = 3) #' -#' counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, rotation_degrees = 45, .element = sample) -#' +#' counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, +#' rotation_degrees = 45, .element = sample) #' #' @docType methods #' @rdname rotate_dimensions-methods @@ -1354,7 +1417,7 @@ setGeneric("rotate_dimensions", function(.data, standardGeneric("rotate_dimensions")) # Set internal -.rotate_dimensions = function(.data, +.rotate_dimensions <- function(.data, dimension_1_column, dimension_2_column, rotation_degrees, @@ -1362,8 +1425,7 @@ setGeneric("rotate_dimensions", function(.data, of_samples = TRUE, dimension_1_column_rotated = NULL, dimension_2_column_rotated = NULL, - action = "add") -{ + action = "add") { # Fix NOTEs . 
= NULL @@ -1406,13 +1468,11 @@ setGeneric("rotate_dimensions", function(.data, ) if (action == "add"){ - .data |> dplyr::left_join( .data_processed, by = quo_name(.element) ) |> # Attach attributes reattach_internals(.data) - } else if (action == "get"){ @@ -1434,75 +1494,104 @@ setGeneric("rotate_dimensions", function(.data, else if (action == "only") .data_processed else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } #' rotate_dimensions -#' @inheritParams rotate_dimensions #' #' @docType methods #' @rdname rotate_dimensions-methods #' -#' @return A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. +#' @return A tbl object with additional columns for the reduced dimensions. +#' additional columns for the rotated dimensions. The rotated dimensions will +#' be added to the original data set as ` rotated ` +#' by default, or as specified in the input arguments. setMethod("rotate_dimensions", "spec_tbl_df", .rotate_dimensions) #' rotate_dimensions -#' @inheritParams rotate_dimensions #' #' @docType methods #' @rdname rotate_dimensions-methods #' -#' @return A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. +#' @return A tbl object with additional columns for the reduced dimensions. +#' additional columns for the rotated dimensions. 
The rotated dimensions will +#' be added to the original data set as ` rotated ` +#' by default, or as specified in the input arguments. setMethod("rotate_dimensions", "tbl_df", .rotate_dimensions) #' rotate_dimensions -#' @inheritParams rotate_dimensions #' #' @docType methods #' @rdname rotate_dimensions-methods #' -#' @return A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. +#' @return A tbl object with additional columns for the reduced dimensions. +#' additional columns for the rotated dimensions. The rotated dimensions +#' will be added to the original data set as +#' ` rotated ` by default, or as +#' specified in the input arguments. setMethod("rotate_dimensions", "tidybulk", .rotate_dimensions) -#' Drop redundant elements (e.g., samples) for which feature (e.g., transcript/gene) abundances are correlated +#' Drop redundant elements (e.g., samples) for which feature (e.g., +#' transcript/gene) abundances are correlated #' #' `r lifecycle::badge("maturing")` #' -#' @description remove_redundancy() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) for correlation method or | | | <...> | for reduced_dimensions method, and returns a consistent object (to the input) with dropped elements (e.g., samples). +#' @description remove_redundancy() takes as input A `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) for correlation method or | +#' | | <...> | for reduced_dimensions method, +#' and returns a consistent object (to the input) with dropped +#' elements (e.g., samples). 
#' #' @importFrom rlang enquo #' -#' #' @name remove_redundancy #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .element The name of the element column (normally samples). #' @param .feature The name of the feature column (normally transcripts/genes) -#' @param .abundance The name of the column including the numerical value the clustering is based on (normally transcript abundance) -#' -#' @param method A character string. The method to use, correlation and reduced_dimensions are available. The latter eliminates one of the most proximar pairs of samples in PCA reduced dimensions. -#' @param of_samples A boolean. In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param .abundance The name of the column including the numerical value the +#' clustering is based on (normally transcript abundance) +#' +#' @param method A character string. The method to use, correlation and +#' reduced_dimensions are available. The latter eliminates one of the most +#' proximal pairs of samples in PCA reduced dimensions. +#' @param of_samples A boolean.
In case the input is a tidybulk object, +#' it indicates whether the element column will be sample or transcript column +#' @param transform A function that will transform the counts, by default it +#' is log1p for RNA sequencing data, but for avoiding transformation you +#' can use identity #' @param correlation_threshold A real number between 0 and 1. For correlation based calculation. #' @param top An integer. How many top genes to select for correlation based method -#' @param Dim_a_column A character string. For reduced_dimension based calculation. The column of one principal component -#' @param Dim_b_column A character string. For reduced_dimension based calculation. The column of another principal component -#' -#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) -#' -#' @details This function removes redundant elements from the original data set (e.g., samples or transcripts). -#' For example, if we want to define cell-type specific signatures with low sample redundancy. +#' @param Dim_a_column A character string. For reduced_dimension based +#' calculation. The column of one principal component +#' @param Dim_b_column A character string. For reduced_dimension based +#' calculation. The column of another principal component +#' +#' @param log_transform DEPRECATED - A boolean, whether the value +#' should be log-transformed (e.g., TRUE for RNA sequencing data) +#' +#' @details This function removes redundant elements from the original +#' data set (e.g., samples or transcripts). +#' For example, if we want to define cell-type specific signatures with +#' low sample redundancy. #' This function returns a tibble with dropped redundant elements (e.g., samples).
#' Two redundancy estimation approaches are supported: -#' (i) removal of highly correlated clusters of elements (keeping a representative) with method="correlation"; +#' (i) removal of highly correlated clusters of elements (keeping a +#' representative) with method="correlation"; #' (ii) removal of most proximal element pairs in a reduced dimensional space. #' #' Underlying method for correlation: -#' widyr::pairwise_cor(sample, transcript,count, sort = TRUE, diag = FALSE, upper = FALSE) +#' widyr::pairwise_cor(sample, transcript,count, sort = TRUE, +#' diag = FALSE, upper = FALSE) #' #' Underlying custom method for reduced dimensions: #' select_closest_pairs = function(df) { @@ -1519,18 +1608,13 @@ setMethod("rotate_dimensions", "tidybulk", .rotate_dimensions) #' !`sample 2` %in% (pair |> select(1:2) |> as.character()) #' ) #' } -#' #' couples -#' #' } #' #' -#' #' @return A tbl object with with dropped redundant elements (e.g., samples). #' #' @examples -#' -#' #' tidybulk::se_mini |> #' identify_abundant() |> #' remove_redundancy( @@ -1578,25 +1662,19 @@ setGeneric("remove_redundancy", function(.data, standardGeneric("remove_redundancy")) # Set internal -.remove_redundancy = function(.data, - .element = NULL, - .feature = NULL, - .abundance = NULL, - method, - - of_samples = TRUE, - - correlation_threshold = 0.9, - top = Inf, - transform = identity, - - Dim_a_column = NULL, - Dim_b_column = NULL, - - # DEPRECATED - log_transform = NULL -) -{ +.remove_redundancy <- function(.data, + .element = NULL, + .feature = NULL, + .abundance = NULL, + method, + of_samples = TRUE, + correlation_threshold = 0.9, + top = Inf, + transform = identity, + Dim_a_column = NULL, + Dim_b_column = NULL, + # DEPRECATED + log_transform = NULL) { # Fix NOTEs . 
= NULL @@ -1605,7 +1683,9 @@ setGeneric("remove_redundancy", function(.data, if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") + deprecate_warn("1.7.4", + "tidybulk::test_differential_abundance(log_transform = )", + details = "The argument log_transform is now deprecated, please use transform.") if(log_transform == TRUE) transform = log1p } @@ -1651,13 +1731,14 @@ setGeneric("remove_redundancy", function(.data, } else stop( - "tidybulk says: method must be either \"correlation\" for dropping correlated elements or \"reduced_dimension\" to drop the closest pair according to two dimensions (e.g., PCA)" + "tidybulk says: method must be either \"correlation\" for dropping ", + "correlated elements or \"reduced_dimension\" to drop the closest pair ", + "according to two dimensions (e.g., PCA)" ) } #' remove_redundancy -#' @inheritParams remove_redundancy #' #' @docType methods #' @rdname remove_redundancy-methods @@ -1666,7 +1747,6 @@ setGeneric("remove_redundancy", function(.data, setMethod("remove_redundancy", "spec_tbl_df", .remove_redundancy) #' remove_redundancy -#' @inheritParams remove_redundancy #' #' @docType methods #' @rdname remove_redundancy-methods @@ -1675,7 +1755,6 @@ setMethod("remove_redundancy", "spec_tbl_df", .remove_redundancy) setMethod("remove_redundancy", "tbl_df", .remove_redundancy) #' remove_redundancy -#' @inheritParams remove_redundancy #' #' @docType methods #' @rdname remove_redundancy-methods @@ -1689,51 +1768,68 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' #' `r lifecycle::badge("maturing")` #' -#' @description adjust_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with 
library(tidySummarizedExperiment)) and returns a consistent object (to the input) with an additional adjusted abundance column. This method uses scaled counts if present. +#' @description adjust_abundance() takes as input a `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with an additional adjusted abundance column. +#' This method uses scaled counts if present. #' #' @importFrom rlang enquo #' -#' #' @name adjust_abundance #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .factor_unwanted A tidy select, e.g. column names without double quotation. c(batch, country) These are the factor that we want to adjust for, including unwanted batcheffect, and unwanted biological effects. -#' @param .factor_of_interest A tidy select, e.g. column names without double quotation. c(treatment) These are the factor that we want to preserve. +#' @param .data A `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .factor_unwanted A tidy select, e.g. column names without double +#' quotation. c(batch, country) These are the factors that we want to adjust for, +#' including unwanted batch effects, and unwanted biological effects. +#' @param .factor_of_interest A tidy select, e.g. column names without double +#' quotation. c(treatment) These are the factors that we want to preserve.
#' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A character string. Methods include combat_seq (default), combat and limma_remove_batch_effect. +#' @param method A character string. Methods include combat_seq (default), +#' combat and limma_remove_batch_effect. #' -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). +#' @param action A character string. Whether to join the new information to the +#' input tbl (add), or just get the non-redundant tbl with the new information (get). #' @param ... Further parameters passed to the function sva::ComBat #' -#' @param .formula DEPRECATED - A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch) -#' @param transform DEPRECATED - A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity -#' @param inverse_transform DEPRECATED - A function that is the inverse of transform (e.g. expm1 is inverse of log1p). This is needed to tranform back the counts after analysis. 
-#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param .formula DEPRECATED - A formula with no response variable, +#' representing the desired linear model where the first covariate is the +#' factor of interest and the second covariate is the unwanted variation +#' (of the kind ~ factor_of_interest + batch) +#' @param transform DEPRECATED - A function that will transform the counts, +#' by default it is log1p for RNA sequencing data, but for avoiding +#' transformation you can use identity +#' @param inverse_transform DEPRECATED - A function that is the inverse of +#' transform (e.g. expm1 is inverse of log1p). This is needed to transform +#' back the counts after analysis. +#' @param log_transform DEPRECATED - A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' #' @details This function adjusts the abundance for (known) unwanted variation. -#' At the moment just an unwanted covariate is allowed at a time using Combat (DOI: 10.1093/bioinformatics/bts034) +#' At the moment just an unwanted covariate is allowed at a time using +#' Combat (DOI: 10.1093/bioinformatics/bts034) #' #' Underlying method: #' sva::ComBat(data, batch = my_batch, mod = design, prior.plots = FALSE, ...)
#' -#' @return A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` -#' -#' -#' +#' @return A consistent object (to the input) with additional columns for +#' the adjusted counts as `_adjusted` #' #' @examples #' -#' -#' #' cm = tidybulk::se_mini #' cm$batch = 0 #' cm$batch[colnames(cm) %in% c("SRR1740035", "SRR1740043")] = 1 #' #' cm |> #' identify_abundant() |> -#' adjust_abundance( .factor_unwanted = batch, .factor_of_interest = condition, method="combat" ) +#' adjust_abundance( .factor_unwanted = batch, +#' .factor_of_interest = condition, method="combat" ) #' #' #' @docType methods @@ -1742,7 +1838,6 @@ setMethod("remove_redundancy", "tidybulk", .remove_redundancy) #' #' setGeneric("adjust_abundance", function(.data, - # DEPRECATED .formula = NULL, .factor_unwanted =NULL, @@ -1759,13 +1854,11 @@ setGeneric("adjust_abundance", function(.data, log_transform = NULL, transform = NULL, inverse_transform = NULL - ) standardGeneric("adjust_abundance")) # Set internal -.adjust_abundance = function(.data, - +.adjust_abundance <- function(.data, # DEPRECATED .formula = NULL, .factor_unwanted = NULL, @@ -1781,8 +1874,7 @@ setGeneric("adjust_abundance", function(.data, log_transform = NULL, transform = NULL, inverse_transform = NULL - ) -{ + ) { # Fix NOTEs . 
= NULL @@ -1798,7 +1890,9 @@ setGeneric("adjust_abundance", function(.data, if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") + deprecate_warn("1.7.4", + "tidybulk::test_differential_abundance(log_transform = )", + details = "The argument log_transform is now deprecated, please use transform.") if(log_transform){ transform = log1p @@ -1813,7 +1907,8 @@ setGeneric("adjust_abundance", function(.data, ) { # Signal the deprecation to the user - deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(transform = )", details = "The argument transform and inverse_transform is now deprecated, please use method argument instead specifying \"combat\", \"combat_seq\" or \"limma_remove_batch_effect\".") + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(transform = )", + details = "The argument transform and inverse_transform is now deprecated, please use method argument instead specifying \"combat\", \"combat_seq\" or \"limma_remove_batch_effect\".") } @@ -1821,7 +1916,8 @@ setGeneric("adjust_abundance", function(.data, if (is_present(.formula) & !is.null(.formula)) { # Signal the deprecation to the user - deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(.formula = )", details = "The argument .formula is now deprecated, please use factor_unwanted and factor_of_interest. Using the formula, the first factor is of interest and the second is unwanted") + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(.formula = )", + details = "The argument .formula is now deprecated, please use factor_unwanted and factor_of_interest. 
Using the formula, the first factor is of interest and the second is unwanted") # Check that .formula includes at least two covariates if (parse_formula(.formula) %>% length %>% st(2)) @@ -1865,9 +1961,12 @@ setGeneric("adjust_abundance", function(.data, when( ".abundant" %in% colnames(.) ~ filter(., .abundant), ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") - (.) - } + warning("tidybulk says: highly abundant transcripts were not ", + "identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), ", + "therefore this operation will be performed on unfiltered data. ", + "In rare occasions this could be wanted. In standard whole-transcriptome ", + "workflows is generally unwanted.") + (.) } ) |> get_adjusted_counts_for_unwanted_variation_bulk( @@ -1885,7 +1984,8 @@ setGeneric("adjust_abundance", function(.data, .data |> # Add adjusted column - dplyr::left_join(.data_processed, by = c(quo_name(.transcript), quo_name(.sample))) |> + dplyr::left_join(.data_processed, by = c(quo_name(.transcript), + quo_name(.sample))) |> # Attach attributes reattach_internals(.data) @@ -1914,56 +2014,64 @@ setGeneric("adjust_abundance", function(.data, else if (action == "only") .data_processed else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this information ", + "to your data frame or \"get\" to just get the information" ) } #' adjust_abundance -#' @inheritParams adjust_abundance #' #' @docType methods #' @rdname adjust_abundance-methods #' -#' @return A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +#' @return A 
consistent object (to the input) with additional columns for the +#' adjusted counts as `_adjusted` setMethod("adjust_abundance", "spec_tbl_df", .adjust_abundance) #' adjust_abundance -#' @inheritParams adjust_abundance #' #' @docType methods #' @rdname adjust_abundance-methods #' -#' @return A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +#' @return A consistent object (to the input) with additional columns for the +#' adjusted counts as `_adjusted` setMethod("adjust_abundance", "tbl_df", .adjust_abundance) #' adjust_abundance -#' @inheritParams adjust_abundance #' #' @docType methods #' @rdname adjust_abundance-methods #' -#' @return A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +#' @return A consistent object (to the input) with additional columns for the +#' adjusted counts as `_adjusted` setMethod("adjust_abundance", "tidybulk", .adjust_abundance) -#' Aggregates multiple counts from the same samples (e.g., from isoforms), concatenates other character columns, and averages other numeric columns +#' Aggregates multiple counts from the same samples (e.g., from isoforms), +#' concatenates other character columns, and averages other numeric columns #' #' `r lifecycle::badge("maturing")` #' -#' @description aggregate_duplicates() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with aggregated transcripts that were duplicated. 
+#' @description aggregate_duplicates() takes as input A `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with aggregated transcripts that were duplicated. #' #' @importFrom rlang enquo #' -#' #' @name aggregate_duplicates #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column #' -#' @param aggregation_function A function for counts aggregation (e.g., sum, median, or mean) +#' @param aggregation_function A function for counts aggregation (e.g., sum, +#' median, or mean) #' @param keep_integer A boolean. Whether to force the aggregated counts to integer #' #' @details This function aggregates duplicated transcripts (e.g., isoforms, ensembl). 
@@ -1979,7 +2087,8 @@ setMethod("adjust_abundance", "tidybulk", .adjust_abundance) #' group_by(!!.sample,!!.transcript) |> #' dplyr::mutate(!!.abundance := !!.abundance |> aggregation_function()) #' -#' @return A consistent object (to the input) with aggregated transcript abundance and annotation +#' @return A consistent object (to the input) with aggregated transcript +#' abundance and annotation #' #' #' @@ -2002,7 +2111,6 @@ setMethod("adjust_abundance", "tidybulk", .adjust_abundance) #' #' setGeneric("aggregate_duplicates", function(.data, - .sample = NULL, .transcript = NULL, .abundance = NULL, @@ -2011,8 +2119,7 @@ setGeneric("aggregate_duplicates", function(.data, standardGeneric("aggregate_duplicates")) # Set internal -.aggregate_duplicates = function(.data, - +.aggregate_duplicates <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, @@ -2054,7 +2161,8 @@ setGeneric("aggregate_duplicates", function(.data, # If I have a big data set else { - message("tidybulk says: for big data sets (>100 samples) this efficient implementation aggregates count columns and keeps the first instance for sample and transcript annotations") + message("tidybulk says: for big data sets (>100 samples) this efficient ", + "implementation aggregates count columns and keeps the first instance for sample and transcript annotations") aggregate_duplicated_transcripts_DT( .data, @@ -2069,30 +2177,30 @@ setGeneric("aggregate_duplicates", function(.data, } #' aggregate_duplicates -#' @inheritParams aggregate_duplicates #' #' @docType methods #' @rdname aggregate_duplicates-methods #' -#' @return A consistent object (to the input) with aggregated transcript abundance and annotation +#' @return A consistent object (to the input) with aggregated transcript +#' abundance and annotation setMethod("aggregate_duplicates", "spec_tbl_df", .aggregate_duplicates) #' aggregate_duplicates -#' @inheritParams aggregate_duplicates #' #' @docType methods #' @rdname 
aggregate_duplicates-methods #' -#' @return A consistent object (to the input) with aggregated transcript abundance and annotation +#' @return A consistent object (to the input) with aggregated transcript +#' abundance and annotation setMethod("aggregate_duplicates", "tbl_df", .aggregate_duplicates) #' aggregate_duplicates -#' @inheritParams aggregate_duplicates #' #' @docType methods #' @rdname aggregate_duplicates-methods #' -#' @return A consistent object (to the input) with aggregated transcript abundance and annotation +#' @return A consistent object (to the input) with aggregated transcript +#' abundance and annotation setMethod("aggregate_duplicates", "tidybulk", .aggregate_duplicates) @@ -2101,21 +2209,35 @@ setMethod("aggregate_duplicates", "tidybulk", .aggregate_duplicates) #' #' `r lifecycle::badge("maturing")` #' -#' @description deconvolve_cellularity() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with the estimated cell type abundance for each sample +#' @description deconvolve_cellularity() takes as input A `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object (to the +#' input) with the estimated cell type abundance for each sample #' #' @importFrom rlang enquo #' -#' #' @name deconvolve_cellularity #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to 
tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param reference A data frame. The methods cibersort and llsr can accept a custom rectangular dataframe with genes as rows names, cell types as column names and gene-transcript abundance as values. For exampler tidybulk::X_cibersort. The transcript/cell_type data frame of integer transcript abundance. If NULL, the default reference for each algorithm will be used. For llsr will be LM22. -#' @param method A character string. The method to be used. At the moment Cibersort (default, can accept custom reference), epic (can accept custom reference) and llsr (linear least squares regression, can accept custom reference), mcp_counter, quantiseq, xcell are available. -#' @param prefix A character string. The prefix you would like to add to the result columns. It is useful if you want to reshape data. -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). +#' @param reference A data frame. The methods cibersort and llsr can accept a +#' custom rectangular dataframe with genes as rows names, cell types as column +#' names and gene-transcript abundance as values. For exampler tidybulk::X_cibersort. +#' The transcript/cell_type data frame of integer transcript abundance. If NULL, +#' the default reference for each algorithm will be used. For llsr will be LM22. +#' @param method A character string. The method to be used. At the moment +#' Cibersort (default, can accept custom reference), epic (can accept custom +#' reference) and llsr (linear least squares regression, can accept custom +#' reference), mcp_counter, quantiseq, xcell are available. +#' @param prefix A character string. The prefix you would like to add to the +#' result columns. 
It is useful if you want to reshape data. +#' @param action A character string. Whether to join the new information to the +#' input tbl (add), or just get the non-redundant tbl with the new information (get). #' @param ... Further parameters passed to the function Cibersort #' #' @details This function infers the cell type composition of our samples @@ -2124,14 +2246,11 @@ setMethod("aggregate_duplicates", "tidybulk", .aggregate_duplicates) #' Underlying method: #' CIBERSORT(Y = data, X = reference, ...) #' -#' @return A consistent object (to the input) including additional columns for each cell type estimated -#' -#' -#' +#' @return A consistent object (to the input) including additional columns +#' for each cell type estimated #' #' @examples #' -#' #' # Subsetting for time efficiency #' tidybulk::se_mini |> deconvolve_cellularity(cores = 1) #' @@ -2152,7 +2271,7 @@ setGeneric("deconvolve_cellularity", function(.data, standardGeneric("deconvolve_cellularity")) # Set internal -.deconvolve_cellularity = function(.data, +.deconvolve_cellularity <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, @@ -2198,9 +2317,8 @@ setGeneric("deconvolve_cellularity", function(.data, if (action == "add"){ .data |> - # Add new annotation - dplyr::left_join(.data_processed, by = quo_name(.sample) ) |> + dplyr::left_join(.data_processed, by = quo_name(.sample)) |> # Attach attributes reattach_internals(.data_processed) @@ -2208,8 +2326,6 @@ setGeneric("deconvolve_cellularity", function(.data, else if (action == "get"){ .data |> - - # Selecting the right columns pivot_sample(!!.sample) |> # @@ -2220,7 +2336,7 @@ setGeneric("deconvolve_cellularity", function(.data, # distinct() |> # Add new annotation - dplyr::left_join(.data_processed, by = quo_name(.sample) ) |> + dplyr::left_join(.data_processed, by = quo_name(.sample)) |> # Attach attributes reattach_internals(.data_processed) @@ -2229,44 +2345,43 @@ setGeneric("deconvolve_cellularity", function(.data, else 
if (action == "only") .data_processed else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } #' deconvolve_cellularity -#' @inheritParams deconvolve_cellularity #' #' @docType methods #' @rdname deconvolve_cellularity-methods #' -#' @return A consistent object (to the input) including additional columns for each cell type estimated +#' @return A consistent object (to the input) including additional columns +#' for each cell type estimated setMethod("deconvolve_cellularity", "spec_tbl_df", .deconvolve_cellularity) #' deconvolve_cellularity -#' @inheritParams deconvolve_cellularity #' #' @docType methods #' @rdname deconvolve_cellularity-methods #' -#' @return A consistent object (to the input) including additional columns for each cell type estimated +#' @return A consistent object (to the input) including additional columns +#' for each cell type estimated setMethod("deconvolve_cellularity", "tbl_df", .deconvolve_cellularity) #' deconvolve_cellularity -#' @inheritParams deconvolve_cellularity #' #' @docType methods #' @rdname deconvolve_cellularity-methods #' -#' @return A consistent object (to the input) including additional columns for each cell type estimated +#' @return A consistent object (to the input) including additional columns +#' for each cell type estimated setMethod("deconvolve_cellularity", "tidybulk", .deconvolve_cellularity) - - #' Get ENTREZ id from gene SYMBOL #' #' @param .data A tt or tbl object. @@ -2280,7 +2395,8 @@ setMethod("deconvolve_cellularity", #' # This function was designed for data.frame #' # Convert from SummarizedExperiment for this example. It is NOT reccomended. 
#' -#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> symbol_to_entrez(.transcript = .feature, .sample = .sample) +#' tidybulk::se_mini |> tidybulk() |> as_tibble() |> +#' symbol_to_entrez(.transcript = .feature, .sample = .sample) #' #' @export #' @@ -2432,7 +2548,6 @@ setGeneric("describe_transcript", function(.data, #' describe_transcript -#' @inheritParams describe_transcript #' #' @docType methods #' @rdname describe_transcript-methods @@ -2441,7 +2556,6 @@ setGeneric("describe_transcript", function(.data, setMethod("describe_transcript", "spec_tbl_df", .describe_transcript) #' describe_transcript -#' @inheritParams describe_transcript #' #' @docType methods #' @rdname describe_transcript-methods @@ -2450,7 +2564,6 @@ setMethod("describe_transcript", "spec_tbl_df", .describe_transcript) setMethod("describe_transcript", "tbl_df", .describe_transcript) #' describe_transcript -#' @inheritParams describe_transcript #' #' @docType methods #' @rdname describe_transcript-methods @@ -2463,29 +2576,36 @@ setMethod("describe_transcript", "tidybulk", .describe_transcript) #' #' \lifecycle{questioning} #' -#' @description ensembl_to_symbol() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with the additional transcript symbol column +#' @description ensembl_to_symbol() takes as input a `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with the additional transcript symbol column #' #' @importFrom rlang enquo #' -#' #' @name ensembl_to_symbol #' -#' @param .data a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` 
(more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .ensembl A character string. The column that is represents ensembl gene id -#' -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). +#' @param .data a `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .ensembl A character string. The column that is represents +#' ensembl gene id #' -#' @details This is useful since different resources use ensembl IDs while others use gene symbol IDs. At the moment this work for human (genes and transcripts) and mouse (genes) data. -#' -#' @return A consistent object (to the input) including additional columns for transcript symbol +#' @param action A character string. Whether to join the new information +#' to the input tbl (add), or just get the non-redundant tbl with the new +#' information (get). #' +#' @details This is useful since different resources use ensembl IDs while +#' others use gene symbol IDs. At the moment this work for human (genes and +#' transcripts) and mouse (genes) data. #' +#' @return A consistent object (to the input) including additional columns +#' for transcript symbol #' #' #' @examples #' -#' -#' #' # This function was designed for data.frame #' # Convert from SummarizedExperiment for this example. It is NOT reccomended. #' @@ -2504,10 +2624,9 @@ setGeneric("ensembl_to_symbol", function(.data, standardGeneric("ensembl_to_symbol")) # Set internal -.ensembl_to_symbol = function(.data, +.ensembl_to_symbol <- function(.data, .ensembl, - action = "add") -{ + action = "add") { # Fix NOTEs . 
= NULL @@ -2544,71 +2663,106 @@ setGeneric("ensembl_to_symbol", function(.data, else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } #' ensembl_to_symbol -#' @inheritParams ensembl_to_symbol #' #' @docType methods #' @rdname ensembl_to_symbol-methods #' -#' @return A consistent object (to the input) including additional columns for transcript symbol +#' @return A consistent object (to the input) including additional columns +#' for transcript symbol setMethod("ensembl_to_symbol", "spec_tbl_df", .ensembl_to_symbol) #' ensembl_to_symbol -#' @inheritParams ensembl_to_symbol #' #' @docType methods #' @rdname ensembl_to_symbol-methods #' -#' @return A consistent object (to the input) including additional columns for transcript symbol +#' @return A consistent object (to the input) including additional columns +#' for transcript symbol setMethod("ensembl_to_symbol", "tbl_df", .ensembl_to_symbol) #' ensembl_to_symbol -#' @inheritParams ensembl_to_symbol #' #' @docType methods #' @rdname ensembl_to_symbol-methods #' -#' @return A consistent object (to the input) including additional columns for transcript symbol +#' @return A consistent object (to the input) including additional columns +#' for transcript symbol setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) -#' Perform differential transcription testing using edgeR quasi-likelihood (QLT), edgeR likelihood-ratio (LR), limma-voom, limma-voom-with-quality-weights or DESeq2 +#' Perform differential transcription testing using edgeR quasi-likelihood +#' (QLT), edgeR likelihood-ratio (LR), limma-voom, +#' limma-voom-with-quality-weights or DESeq2 #' #' `r lifecycle::badge("maturing")` #' -#' @description test_differential_abundance() takes as input A `tbl` (with at least three 
columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +#' @description test_differential_abundance() takes as input A `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object (to the +#' input) with additional columns for the statistics from the hypothesis test. #' #' @importFrom rlang enquo #' -#' #' @name test_differential_abundance #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula representing the desired linear model. If there is more than one factor, they should be in the order factor of interest + additional factors. +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .formula A formula representing the desired linear model. If there +#' is more than one factor, they should be in the order factor of interest + +#' additional factors. #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column #' -#' @param contrasts This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. 
The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT), "edger_robust_likelihood_ratio", "DESeq2", "limma_voom", "limma_voom_sample_weights", "glmmseq_lme4", "glmmseq_glmmtmb" -#' @param test_above_log2_fold_change A positive real value. This works for edgeR and limma_voom methods. It uses the `treat` function, which tests that the difference in abundance is bigger than this threshold rather than zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. -#' @param scaling_method A character string. The scaling method passed to the back-end functions: edgeR and limma-voom (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile"). Setting the parameter to \"none\" will skip the compensation for sequencing-depth for the method edgeR or limma-voom. -#' @param omit_contrast_in_colnames If just one contrast is specified you can choose to omit the contrast label in the colnames. -#' @param prefix A character string. The prefix you would like to add to the result columns. It is useful if you want to compare several methods. -#' @param action A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get). +#' @param contrasts This parameter takes the format of the contrast parameter +#' of the method of choice. For edgeR and limma-voom is a character vector. +#' For DESeq2 is a list including a character vector of length three. The first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param method A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), +#' "edgeR_likelihood_ratio" (i.e., LRT), "edger_robust_likelihood_ratio", +#' "DESeq2", "limma_voom", "limma_voom_sample_weights" +#' @param test_above_log2_fold_change A positive real value. This works for edgeR +#' and limma_voom methods. 
It uses the `treat` function, which tests that the +#' difference in abundance is bigger than this threshold rather than zero +#' \url{https://pubmed.ncbi.nlm.nih.gov/19176553}. +#' @param scaling_method A character string. The scaling method passed to the +#' back-end functions: edgeR and limma-voom (i.e., edgeR::calcNormFactors; +#' "TMM","TMMwsp","RLE","upperquartile"). Setting the parameter to \"none\" +#' will skip the compensation for sequencing-depth for the method edgeR or limma-voom. +#' @param omit_contrast_in_colnames If just one contrast is specified you can +#' choose to omit the contrast label in the colnames. +#' @param prefix A character string. The prefix you would like to add to the +#' result columns. It is useful if you want to compare several methods. +#' @param action A character string. Whether to join the new information to the +#' input tbl (add), or just get the non-redundant tbl with the new information (get). #' @param significance_threshold DEPRECATED - A real between 0 and 1 (usually 0.05). -#' @param fill_missing_values DEPRECATED - A boolean. Whether to fill missing sample/transcript values with the median of the transcript. This is rarely needed. -#' @param .contrasts DEPRECATED - This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param ... Further arguments passed to some of the internal experimental functions. For example for glmmSeq, it is possible to pass .dispersion, and .scaling_factor column tidyeval to skip the caluclation of dispersion and scaling and use precalculated values. This is helpful is you want to calculate those quantities on many genes and do DE testing on fewer genes. .scaling_factor is the TMM value that can be obtained with tidybulk::scale_abundance. 
-#' -#' -#' @details This function provides the option to use edgeR \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights \url{https://doi.org/10.1093/nar/gkv412} or DESeq2 \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. -#' All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula. +#' @param fill_missing_values DEPRECATED - A boolean. Whether to fill missing +#' sample/transcript values with the median of the transcript. This is rarely needed. +#' @param .contrasts DEPRECATED - This parameter takes the format of the +#' contrast parameter of the method of choice. For edgeR and limma-voom is a +#' character vector. For DESeq2 is a list including a character vector of length +#' three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param ... Further arguments passed to some of the internal functions. +#' Currently, it is needed just for internal debug. +#' +#' +#' @details This function provides the option to use edgeR +#' \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom +#' \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights +#' \url{https://doi.org/10.1093/nar/gkv412} or DESeq2 +#' \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. +#' All methods use raw counts, irrespective of if scale_abundance or +#' adjust_abundance have been calculated, therefore it is essential to add +#' covariates such as batch effects (if applicable) in the formula. 
#' #' Underlying method for edgeR framework: #' @@ -2674,13 +2828,10 @@ setMethod("ensembl_to_symbol", "tidybulk", .ensembl_to_symbol) #' progress = TRUE, #' method = method |> str_remove("(?i)^glmmSeq_" ), #' ) -#' -#' + #' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate). #' #' -#' -#' #' @examples #' #' # edgeR @@ -2770,7 +2921,7 @@ setGeneric("test_differential_abundance", function(.data, # Set internal #' @importFrom rlang inform -.test_differential_abundance = function(.data, +.test_differential_abundance <- function(.data, .formula, .sample = NULL, .transcript = NULL, @@ -2781,16 +2932,13 @@ setGeneric("test_differential_abundance", function(.data, scaling_method = "TMM", omit_contrast_in_colnames = FALSE, prefix = "", - action = "add", ..., - # DEPRECATED significance_threshold = NULL, fill_missing_values = NULL, .contrasts = NULL - ) -{ + ) { # Fix NOTEs . = NULL @@ -2808,7 +2956,9 @@ setGeneric("test_differential_abundance", function(.data, if (is_present(significance_threshold) & !is.null(significance_threshold)) { # Signal the deprecation to the user - deprecate_warn("1.1.7", "tidybulk::test_differential_abundance(significance_threshold = )", details = "The argument significance_threshold is now deprecated, tigether with the column significance.") + deprecate_warn("1.1.7", + "tidybulk::test_differential_abundance(significance_threshold = )", + details = "The argument significance_threshold is now deprecated, tigether with the column significance.") } @@ -2816,15 +2966,17 @@ setGeneric("test_differential_abundance", function(.data, if (is_present(fill_missing_values) & !is.null(significance_threshold)) { # Signal the deprecation to the user - deprecate_warn("1.1.7", "tidybulk::test_differential_abundance(fill_missing_values = )", details = "The argument fill_missing_values is now deprecated, you will receive a warning/error instead. 
Please use externally the methods fill_missing_abundance or impute_missing_abundance instead.") - + deprecate_warn("1.1.7", + "tidybulk::test_differential_abundance(fill_missing_values = )", + details = "The argument fill_missing_values is now deprecated, you will receive a warning/error instead. Please use externally the methods fill_missing_abundance or impute_missing_abundance instead.") } # DEPRECATION OF .constrasts if (is_present(.contrasts) & !is.null(.contrasts)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") + deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", + details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") contrasts = .contrasts } @@ -2853,7 +3005,10 @@ such as batch effects (if applicable) in the formula. when( ".abundant" %in% colnames(.) ~ filter(., .abundant), ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") + warning("tidybulk says: highly abundant transcripts were not identified ", + "(i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore ", + "this operation will be performed on unfiltered data. In rare occasions ", + "this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") (.) } ) %>% @@ -2862,7 +3017,8 @@ such as batch effects (if applicable) in the formula. 
when( # edgeR - tolower(method) %in% c("edger_quasi_likelihood", "edger_likelihood_ratio", "edger_robust_likelihood_ratio") ~ + tolower(method) %in% c("edger_quasi_likelihood", "edger_likelihood_ratio", + "edger_robust_likelihood_ratio") ~ get_differential_transcript_abundance_bulk( ., .formula, @@ -2927,7 +3083,10 @@ such as batch effects (if applicable) in the formula. ), # Else error - TRUE ~ stop("tidybulk says: the only methods supported at the moment are \"edgeR_quasi_likelihood\" (i.e., QLF), \"edgeR_likelihood_ratio\" (i.e., LRT), \"limma_voom\", \"limma_voom_sample_weights\", \"DESeq2\", \"glmmseq_lme4\", \"glmmseq_glmmTMB\"") + TRUE ~ stop('tidybulk says: the only methods supported at the moment are', + '"edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio"', + '(i.e., LRT), "limma_voom", "limma_voom_sample_weights", ', + '"DESeq2", "glmmseq_lme4", "glmmseq_glmmTMB"') ) @@ -2956,39 +3115,42 @@ such as batch effects (if applicable) in the formula. else if (action == "only") .data_processed else stop( - "tidybulk says: action must be either \"add\" for adding this information to your data frame or \"get\" to just get the information" + "tidybulk says: action must be either \"add\" for adding this ", + "information to your data frame or \"get\" to just get the information" ) } #' test_differential_abundance -#' @inheritParams test_differential_abundance #' #' @docType methods #' @rdname test_differential_abundance-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the test (e.g., log fold change, p-value and false discovery rate). 
setMethod("test_differential_abundance", "spec_tbl_df", .test_differential_abundance) #' test_differential_abundance -#' @inheritParams test_differential_abundance #' #' @docType methods #' @rdname test_differential_abundance-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("test_differential_abundance", "tbl_df", .test_differential_abundance) #' test_differential_abundance -#' @inheritParams test_differential_abundance #' #' @docType methods #' @rdname test_differential_abundance-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("test_differential_abundance", "tidybulk", .test_differential_abundance) @@ -3001,25 +3163,37 @@ setMethod("test_differential_abundance", #' #' `r lifecycle::badge("maturing")` #' -#' @description keep_variable() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. 
+#' @description keep_variable() takes as input A `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with additional columns for the statistics from the +#' hypothesis test. #' #' @importFrom rlang enquo #' -#' #' @name keep_variable #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column #' @param top Integer. 
Number of top transcript to consider -#' @param transform A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity +#' @param transform A function that will tranform the counts, by default it is +#' log1p for RNA sequencing data, but for avoinding tranformation you can use +#' identity #' -#' @param log_transform DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data) +#' @param log_transform DEPRECATED - A boolean, whether the value should be +#' log-transformed (e.g., TRUE for RNA sequencing data) #' -#' @details At the moment this function uses edgeR \url{https://doi.org/10.1093/bioinformatics/btp616} +#' @details At the moment this function uses edgeR +#' \url{https://doi.org/10.1093/bioinformatics/btp616} #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). #' #' Underlying method: #' s <- rowMeans((x - rowMeans(x)) ^ 2) @@ -3027,12 +3201,8 @@ setMethod("test_differential_abundance", #' x <- x[o[1L:top], , drop = FALSE] #' variable_trancripts = rownames(x) #' -#' -#' #' @examples #' -#' -#' #' keep_variable(tidybulk::se_mini, top = 500) #' #' @@ -3053,16 +3223,15 @@ setGeneric("keep_variable", function(.data, standardGeneric("keep_variable")) # Set internal -.keep_variable = function(.data, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - top = 500, - transform = log1p, - - # DEPRECATED - log_transform = NULL) -{ +.keep_variable <- function(.data, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + top = 500, + transform = log1p, + + # DEPRECATED + log_transform = NULL) { # Fix NOTEs . 
= NULL @@ -3071,7 +3240,9 @@ setGeneric("keep_variable", function(.data, if (is_present(log_transform) & !is.null(log_transform)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(log_transform = )", details = "The argument log_transform is now deprecated, please use transform.") + deprecate_warn("1.7.4", + "tidybulk::test_differential_abundance(log_transform = )", + details = "The argument log_transform is now deprecated, please use transform.") if(log_transform == TRUE) transform = log1p } @@ -3098,52 +3269,66 @@ setGeneric("keep_variable", function(.data, } #' keep_variable -#' @inheritParams keep_variable #' #' @docType methods #' @rdname keep_variable-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for +#' the statistics from the hypothesis test (e.g., log fold change, p-value +#' and false discovery rate). setMethod("keep_variable", "spec_tbl_df", .keep_variable) #' keep_variable -#' @inheritParams keep_variable #' #' @docType methods #' @rdname keep_variable-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("keep_variable", "tbl_df", .keep_variable) #' keep_variable -#' @inheritParams keep_variable #' #' @docType methods #' @rdname keep_variable-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). 
+#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("keep_variable", "tidybulk", .keep_variable) #' find abundant transcripts #' #' `r lifecycle::badge("maturing")` #' -#' @description identify_abundant() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +#' @description identify_abundant() takes as input A `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with additional columns for the statistics from the hypothesis test. #' #' @importFrom rlang enquo -#' #' @importFrom dplyr filter #' @importFrom tidyr drop_na #' #' @name identify_abundant #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param factor_of_interest The name of the column of the factor of interest. This is used for defining sample groups for the filtering process. It uses the filterByExpr function from edgeR. 
-#' @param minimum_counts A real positive number. It is the threshold of count per million that is used to filter transcripts/genes out from the scaling procedure. -#' @param minimum_proportion A real positive number between 0 and 1. It is the threshold of proportion of samples for each transcripts/genes that have to be characterised by a cmp bigger than the threshold to be included for scaling procedure. +#' @param factor_of_interest The name of the column of the factor of interest. +#' This is used for defining sample groups for the filtering process. It uses +#' the filterByExpr function from edgeR. +#' @param minimum_counts A real positive number. It is the threshold of count +#' per million that is used to filter transcripts/genes out from the scaling procedure. +#' @param minimum_proportion A real positive number between 0 and 1. It is the +#' threshold of proportion of samples for each transcripts/genes that have to +#' be characterised by a cmp bigger than the threshold to be included for +#' scaling procedure. #' #' @details At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) #' @@ -3155,15 +3340,12 @@ setMethod("keep_variable", "tidybulk", .keep_variable) #' min.prop = minimum_proportion #' ) #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). -#' -#' -#' +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). 
#' #' @examples #' -#' -#' #' identify_abundant( #' tidybulk::se_mini #' ) @@ -3183,14 +3365,13 @@ setGeneric("identify_abundant", function(.data, standardGeneric("identify_abundant")) # Set internal -.identify_abundant = function(.data, +.identify_abundant <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, factor_of_interest = NULL, minimum_counts = 10, - minimum_proportion = 0.7) -{ + minimum_proportion = 0.7) { # Fix NOTEs . = NULL @@ -3219,7 +3400,7 @@ setGeneric("identify_abundant", function(.data, } - if( ".abundant" %in% colnames(.data) ) return( .data |> reattach_internals(.data) ) + if(".abundant" %in% colnames(.data)) return(.data |> reattach_internals(.data)) # Check if package is installed, otherwise install @@ -3235,7 +3416,8 @@ setGeneric("identify_abundant", function(.data, !is.null(factor_of_interest) && !factor_of_interest |> quo_is_null() && !factor_of_interest |> quo_is_symbolic() - ) stop("tidybulk says: factor_of_interest must be symbolic (i.e. column name/s not surrounded by single or double quotes) and not a character.") + ) stop("tidybulk says: factor_of_interest must be symbolic (i.e. ", + "column name/s not surrounded by single or double quotes) and not a character.") if( @@ -3316,30 +3498,33 @@ setGeneric("identify_abundant", function(.data, } #' keep_abundant -#' @inheritParams identify_abundant #' #' @docType methods #' @rdname identify_abundant-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). 
setMethod("identify_abundant", "spec_tbl_df", .identify_abundant) #' identify_abundant -#' @inheritParams identify_abundant #' #' @docType methods #' @rdname identify_abundant-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("identify_abundant", "tbl_df", .identify_abundant) #' identify_abundant -#' @inheritParams identify_abundant #' #' @docType methods #' @rdname identify_abundant-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("identify_abundant", "tidybulk", .identify_abundant) @@ -3347,21 +3532,34 @@ setMethod("identify_abundant", "tidybulk", .identify_abundant) #' #' \lifecycle{questioning} #' -#' @description keep_abundant() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +#' @description keep_abundant() takes as input A `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with additional columns for the statistics from the +#' hypothesis test. 
#' #' @importFrom rlang enquo -#' #' @importFrom dplyr filter #' #' @name keep_abundant #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param factor_of_interest The name of the column of the factor of interest. This is used for defining sample groups for the filtering process. It uses the filterByExpr function from edgeR. -#' @param minimum_counts A real positive number. It is the threshold of count per million that is used to filter transcripts/genes out from the scaling procedure. -#' @param minimum_proportion A real positive number between 0 and 1. It is the threshold of proportion of samples for each transcripts/genes that have to be characterised by a cmp bigger than the threshold to be included for scaling procedure. +#' @param factor_of_interest The name of the column of the factor of interest. +#' This is used for defining sample groups for the filtering process. It uses +#' the filterByExpr function from edgeR. +#' @param minimum_counts A real positive number. It is the threshold of count +#' per million that is used to filter transcripts/genes out from the scaling +#' procedure. +#' @param minimum_proportion A real positive number between 0 and 1. It is the +#' threshold of proportion of samples for each transcripts/genes that have to +#' be characterised by a cmp bigger than the threshold to be included for +#' scaling procedure. 
#' #' @details At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) #' @@ -3373,15 +3571,12 @@ setMethod("identify_abundant", "tidybulk", .identify_abundant) #' min.prop = minimum_proportion #' ) #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). -#' -#' +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). #' #' #' @examples -#' -#' -#' #' keep_abundant( #' tidybulk::se_mini #' ) @@ -3401,15 +3596,13 @@ setGeneric("keep_abundant", function(.data, standardGeneric("keep_abundant")) # Set internal -.keep_abundant = function(.data, - .sample = NULL, - .transcript = NULL, - .abundance = NULL, - factor_of_interest = NULL, - minimum_counts = 10, - minimum_proportion = 0.7) -{ - +.keep_abundant <- function(.data, + .sample = NULL, + .transcript = NULL, + .abundance = NULL, + factor_of_interest = NULL, + minimum_counts = 10, + minimum_proportion = 0.7) { # Fix NOTEs . = NULL @@ -3448,30 +3641,33 @@ setGeneric("keep_abundant", function(.data, } #' keep_abundant -#' @inheritParams keep_abundant #' #' @docType methods #' @rdname keep_abundant-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for +#' the statistics from the hypothesis test (e.g., log fold change, p-value +#' and false discovery rate). 
setMethod("keep_abundant", "spec_tbl_df", .keep_abundant) #' keep_abundant -#' @inheritParams keep_abundant #' #' @docType methods #' @rdname keep_abundant-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("keep_abundant", "tbl_df", .keep_abundant) #' keep_abundant -#' @inheritParams keep_abundant #' #' @docType methods #' @rdname keep_abundant-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). 
setMethod("keep_abundant", "tidybulk", .keep_abundant) @@ -3480,29 +3676,48 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' #' `r lifecycle::badge("maturing")` #' -#' @description test_gene_enrichment() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` of gene set information +#' @description test_gene_enrichment() takes as input a `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a `tbl` of gene set information #' #' @importFrom rlang enquo #' -#' #' @name test_gene_enrichment #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula with no response variable, representing the desired linear model +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .formula A formula with no response variable, representing the +#' desired linear model #' @param .sample The name of the sample column #' @param .entrez The ENTREZ ID of the transcripts/genes #' @param .abundance The name of the transcript/gene abundance column -#' @param contrasts This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' @param methods A character vector. 
One or 3 or more methods to use in the testing (currently EGSEA errors if 2 are used). Type EGSEA::egsea.base() to see the supported GSE methods. -#' @param gene_sets A character vector or a list. It can take one or more of the following built-in collections as a character vector: c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. Alternatively, a list of user-supplied gene sets can be provided, to be used with EGSEA buildCustomIdx. In that case, each gene set is a character vector of Entrez IDs and the names of the list are the gene set names. +#' @param contrasts This parameter takes the format of the contrast parameter +#' of the method of choice. For edgeR and limma-voom is a character vector. +#' For DESeq2 is a list including a character vector of length three. The first +#' covariate is the one the model is tested against (e.g., ~ factor_of_interest) +#' @param methods A character vector. One or 3 or more methods to use in the +#' testing (currently EGSEA errors if 2 are used). Type EGSEA::egsea.base() to +#' see the supported GSE methods. +#' @param gene_sets A character vector or a list. It can take one or more of +#' the following built-in collections as a character vector: c("h", "c1", "c2", +#' "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", +#' "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. +#' Alternatively, a list of user-supplied gene sets can be provided, to be used +#' with EGSEA buildCustomIdx. In that case, each gene set is a character vector +#' of Entrez IDs and the names of the list are the gene set names. #' @param species A character. It can be human, mouse or rat. #' @param cores An integer. The number of cores available #' #' @param method DEPRECATED. Please use methods. -#' @param .contrasts DEPRECATED - This parameter takes the format of the contrast parameter of the method of choice. 
For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) -#' -#' @details This wrapper executes ensemble gene enrichment analyses of the dataset using EGSEA (DOI:0.12688/f1000research.12544.1) +#' @param .contrasts DEPRECATED - This parameter takes the format of the +#' contrast parameter of the method of choice. For edgeR and limma-voom is a +#' character vector. For DESeq2 is a list including a character vector of length +#' three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest) #' +#' @details This wrapper executes ensemble gene enrichment analyses of the +#' dataset using EGSEA (DOI:0.12688/f1000research.12544.1) #' #' dge = #' data |> @@ -3556,7 +3771,8 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' .entrez = entrez, #' .abundance = count, #' methods = c("roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), -#' gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), +#' gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", +#' "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), #' species="human", #' cores = 2 #' ) @@ -3569,17 +3785,16 @@ setMethod("keep_abundant", "tidybulk", .keep_abundant) #' #' setGeneric("test_gene_enrichment", function(.data, - .formula, - .sample = NULL, - .entrez, - .abundance = NULL, - contrasts = NULL, - methods = c("camera" , "roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), - species, - cores = 10, - - # DEPRECATED + .formula, + .sample = NULL, + .entrez, + .abundance = NULL, + contrasts = NULL, + methods = c("camera" , "roast" , "safe", "gage", "padog" , "globaltest", "ora"), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", 
"kegg_disease", "kegg_metabolism", "kegg_signaling"), + species, + cores = 10, + # DEPRECATED method = NULL, .contrasts = NULL ) @@ -3587,21 +3802,22 @@ setGeneric("test_gene_enrichment", function(.data, # Set internal #' @importFrom lifecycle deprecate_warn -.test_gene_enrichment = function(.data, - .formula, - .sample = NULL, - .entrez, - .abundance = NULL, - contrasts = NULL, - methods = c("camera" , "roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), - species, - cores = 10, - - # DEPRECATED - method = NULL, - .contrasts = NULL - ) { +.test_gene_enrichment <- function( + .data, + .formula, + .sample = NULL, + .entrez, + .abundance = NULL, + contrasts = NULL, + methods = c("camera" , "roast" , "safe", "gage", "padog", "globaltest", "ora" ), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", + "kegg_disease", "kegg_metabolism", "kegg_signaling"), + species, + cores = 10, + # DEPRECATED + method = NULL, + .contrasts = NULL + ) { # Fix NOTEs . 
= NULL @@ -3610,7 +3826,8 @@ setGeneric("test_gene_enrichment", function(.data, if (is_present(method) & !is.null(method)) { # Signal the deprecation to the user - deprecate_warn("1.3.2", "tidybulk::test_gene_enrichment(method = )", details = "The argument method is now deprecated please use methods") + deprecate_warn("1.3.2", "tidybulk::test_gene_enrichment(method = )", + details = "The argument method is now deprecated please use methods") methods = method } @@ -3618,7 +3835,8 @@ setGeneric("test_gene_enrichment", function(.data, if (is_present(.contrasts) & !is.null(.contrasts)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") + deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", + details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") contrasts = .contrasts } @@ -3655,7 +3873,10 @@ setGeneric("test_gene_enrichment", function(.data, when( ".abundant" %in% colnames(.) ~ filter(., .abundant), ~ { - warning("tidybulk says: highly abundant transcripts were not identified (i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore this operation will be performed on unfiltered data. In rare occasions this could be wanted. In standard whole-transcriptome workflows is generally unwanted.") + warning("tidybulk says: highly abundant transcripts were not identified ", + "(i.e. identify_abundant()) or filtered (i.e., keep_abundant), therefore ", + "this operation will be performed on unfiltered data. In rare occasions this ", + "could be wanted. In standard whole-transcriptome workflows is generally unwanted.") (.) 
} ) |> @@ -3677,7 +3898,6 @@ setGeneric("test_gene_enrichment", function(.data, } #' test_gene_enrichment -#' @inheritParams test_gene_enrichment #' #' @docType methods #' @rdname test_gene_enrichment-methods @@ -3688,7 +3908,6 @@ setMethod("test_gene_enrichment", .test_gene_enrichment) #' test_gene_enrichment -#' @inheritParams test_gene_enrichment #' #' @docType methods #' @rdname test_gene_enrichment-methods @@ -3699,7 +3918,6 @@ setMethod("test_gene_enrichment", .test_gene_enrichment) #' test_gene_enrichment -#' @inheritParams test_gene_enrichment #' #' @docType methods #' @rdname test_gene_enrichment-methods @@ -3713,24 +3931,32 @@ setMethod("test_gene_enrichment", #' #' `r lifecycle::badge("maturing")` #' -#' @description test_gene_overrepresentation() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics +#' @description test_gene_overrepresentation() takes as input a `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics #' #' @importFrom rlang enquo #' @importFrom rlang quo_is_missing #' -#' #' @name test_gene_overrepresentation #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .entrez The ENTREZ ID of the transcripts/genes #' @param .do_test A boolean 
column name symbol. It indicates the transcript to check -#' @param species A character. For example, human or mouse. MSigDB uses the latin species names (e.g., \"Mus musculus\", \"Homo sapiens\") -#' @param gene_sets A character vector. The subset of MSigDB datasets you want to test against (e.g. \"C2\"). If NULL all gene sets are used (suggested). This argument was added to avoid time overflow of the examples. +#' @param species A character. For example, human or mouse. MSigDB uses the +#' latin species names (e.g., \"Mus musculus\", \"Homo sapiens\") +#' @param gene_sets A character vector. The subset of MSigDB datasets you want +#' to test against (e.g. \"C2\"). If NULL all gene sets are used (suggested). +#' This argument was added to avoid time overflow of the examples. #' #' @param gene_set DEPRECATED. Use gene_sets instead. #' -#' @details This wrapper execute gene enrichment analyses of the dataset using a list of transcripts and GSEA. +#' @details This wrapper execute gene enrichment analyses of the dataset using +#' a list of transcripts and GSEA. #' This wrapper uses clusterProfiler (DOI: doi.org/10.1089/omi.2011.0118) on the back-end. #' #' Undelying method: @@ -3785,7 +4011,7 @@ setGeneric("test_gene_overrepresentation", function(.data, standardGeneric("test_gene_overrepresentation")) # Set internal -.test_gene_overrepresentation = function(.data, +.test_gene_overrepresentation <- function(.data, .entrez, .do_test, species, @@ -3793,7 +4019,6 @@ setGeneric("test_gene_overrepresentation", function(.data, gene_sets = NULL, gene_set = NULL # DEPRECATED ) { - # Comply with CRAN NOTES . 
= NULL @@ -3808,7 +4033,7 @@ setGeneric("test_gene_overrepresentation", function(.data, stop("tidybulk says: the .entrez parameter appears to no be set") # Check column type - if (.data %>% mutate(my_do_test = !!.do_test) %>% pull(my_do_test) |> is("logical") |> not() ) + if (.data %>% mutate(my_do_test = !!.do_test) %>% pull(my_do_test) |> is("logical") |> not() ) stop("tidybulk says: .do_test column must be logical (i.e., TRUE or FALSE)") # Check packages msigdbr @@ -3820,7 +4045,8 @@ setGeneric("test_gene_overrepresentation", function(.data, # Check is correct species name if(species %in% msigdbr::msigdbr_species()$species_name |> not()) - stop(sprintf("tidybulk says: wrong species name. MSigDB uses the latin species names (e.g., %s)", paste(msigdbr::msigdbr_species()$species_name, collapse=", "))) + stop(sprintf("tidybulk says: wrong species name. MSigDB uses the latin species names (e.g., %s)", + paste(msigdbr::msigdbr_species()$species_name, collapse=", "))) .data |> #filter(!!.entrez %in% unique(m_df$entrez_gene)) |> @@ -3830,13 +4056,13 @@ setGeneric("test_gene_overrepresentation", function(.data, entrez_over_to_gsea(species, gene_collections = gene_sets ) |> # Add methods used - memorise_methods_used(c("clusterProfiler", "msigdbr", "msigdb"), object_containing_methods = .data) + memorise_methods_used(c("clusterProfiler", "msigdbr", "msigdb"), + object_containing_methods = .data) } #' test_gene_overrepresentation -#' @inheritParams test_gene_overrepresentation #' #' @docType methods #' @rdname test_gene_overrepresentation-methods @@ -3847,7 +4073,6 @@ setMethod("test_gene_overrepresentation", .test_gene_overrepresentation) #' test_gene_overrepresentation -#' @inheritParams test_gene_overrepresentation #' #' @docType methods #' @rdname test_gene_overrepresentation-methods @@ -3858,7 +4083,6 @@ setMethod("test_gene_overrepresentation", .test_gene_overrepresentation) #' test_gene_overrepresentation -#' @inheritParams test_gene_overrepresentation #' #' @docType 
methods #' @rdname test_gene_overrepresentation-methods @@ -3872,31 +4096,44 @@ setMethod("test_gene_overrepresentation", #' #' \lifecycle{maturing} #' -#' @description test_gene_rank() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics +#' @description test_gene_rank() takes as input a `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics #' #' @importFrom rlang enquo #' @importFrom rlang quo_is_missing #' -#' #' @name test_gene_rank #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' @param .entrez The ENTREZ ID of the transcripts/genes #' @param .arrange_desc A column name of the column to arrange in decreasing order -#' @param species A character. For example, human or mouse. MSigDB uses the latin species names (e.g., \"Mus musculus\", \"Homo sapiens\") -#' @param gene_sets A character vector or a list. It can take one or more of the following built-in collections as a character vector: c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. Alternatively, a list of user-supplied gene sets can be provided, to be used with EGSEA buildCustomIdx. 
In that case, each gene set is a character vector of Entrez IDs and the names of the list are the gene set names. +#' @param species A character. For example, human or mouse. MSigDB uses the +#' latin species names (e.g., \"Mus musculus\", \"Homo sapiens\") +#' @param gene_sets A character vector or a list. It can take one or more of +#' the following built-in collections as a character vector: c("h", "c1", "c2", +#' "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", +#' "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. +#' Alternatively, a list of user-supplied gene sets can be provided, to be used +#' with EGSEA buildCustomIdx. In that case, each gene set is a character vector +#' of Entrez IDs and the names of the list are the gene set names. #' #' @param gene_set DEPRECATED. Use gene_sets instead. #' -#' @details This wrapper execute gene enrichment analyses of the dataset using a list of transcripts and GSEA. +#' @details This wrapper execute gene enrichment analyses of the dataset using +#' a list of transcripts and GSEA. #' This wrapper uses clusterProfiler (DOI: doi.org/10.1089/omi.2011.0118) on the back-end. #' #' Undelying method: #'# Get gene sets signatures #'msigdbr::msigdbr(species = species) %>% #' -#' # Filter specific gene_sets if specified. This was introduced to speed up examples executionS +#' # Filter specific gene_sets if specified. This was introduced to speed up +#' examples executionS #' when( #' !is.null(gene_sets ) ~ filter(., gs_cat %in% gene_sets ), #' ~ (.) 
@@ -3918,8 +4155,6 @@ setMethod("test_gene_overrepresentation", #' @return A consistent object (to the input) #' #' -#' -#' #' @examples #' #' print("Not run for build time.") @@ -3957,7 +4192,7 @@ setGeneric("test_gene_rank", function(.data, standardGeneric("test_gene_rank")) # Set internal -.test_gene_rank = function(.data, +.test_gene_rank <- function(.data, .entrez, .arrange_desc, species, @@ -3973,7 +4208,8 @@ setGeneric("test_gene_rank", function(.data, if (is_present(gene_set) & !is.null(gene_set)) { # Signal the deprecation to the user - deprecate_warn("1.3.1", "tidybulk::test_gene_rank(gene_set = )", details = "The argument gene_set is now deprecated please use gene_sets.") + deprecate_warn("1.3.1", "tidybulk::test_gene_rank(gene_set = )", + details = "The argument gene_set is now deprecated please use gene_sets.") gene_sets = gene_set } @@ -3982,7 +4218,8 @@ setGeneric("test_gene_rank", function(.data, if (is_present(.sample) & !is.null(.sample)) { # Signal the deprecation to the user - deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", details = "The argument .sample is now deprecated and not needed anymore.") + deprecate_warn("1.13.2", "tidybulk::test_gene_rank(.sample = )", + details = "The argument .sample is now deprecated and not needed anymore.") } @@ -4046,7 +4283,6 @@ setGeneric("test_gene_rank", function(.data, } #' test_gene_rank -#' @inheritParams test_gene_rank #' #' @docType methods #' @rdname test_gene_rank-methods @@ -4057,7 +4293,6 @@ setMethod("test_gene_rank", .test_gene_rank) #' test_gene_rank -#' @inheritParams test_gene_rank #' #' @docType methods #' @rdname test_gene_rank-methods @@ -4068,7 +4303,6 @@ setMethod("test_gene_rank", .test_gene_rank) #' test_gene_rank -#' @inheritParams test_gene_rank #' #' @docType methods #' @rdname test_gene_rank-methods @@ -4083,17 +4317,23 @@ setMethod("test_gene_rank", #' #' `r lifecycle::badge("maturing")` #' -#' @description pivot_sample() takes as input a `tbl` (with at least three 
columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with only sample-related columns -#' +#' @description pivot_sample() takes as input a `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a `tbl` with only sample-related columns #' #' #' @name pivot_sample #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .sample The name of the sample column #' #' -#' @details This functon extracts only sample-related information for downstream analysis (e.g., visualisation). It is disruptive in the sense that it cannot be passed anymore to tidybulk function. +#' @details This function extracts only sample-related information for +#' downstream analysis (e.g., visualisation). It is disruptive in the sense +#' that it cannot be passed anymore to tidybulk function. 
#' #' @return A `tbl` with transcript-related information #' @@ -4139,7 +4379,6 @@ setGeneric("pivot_sample", function(.data, } #' pivot_sample -#' @inheritParams pivot_sample #' #' @docType methods #' @rdname pivot_sample-methods @@ -4149,7 +4388,6 @@ setMethod("pivot_sample", .pivot_sample) #' pivot_sample -#' @inheritParams pivot_sample #' #' @docType methods #' @rdname pivot_sample-methods @@ -4159,7 +4397,6 @@ setMethod("pivot_sample", .pivot_sample) #' pivot_sample -#' @inheritParams pivot_sample #' #' @docType methods #' @rdname pivot_sample-methods @@ -4172,17 +4409,24 @@ setMethod("pivot_sample", #' #' `r lifecycle::badge("maturing")` #' -#' @description pivot_transcript() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with only transcript-related columns -#' +#' @description pivot_transcript() takes as input a `tbl` (with at least three +#' columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a `tbl` with only +#' transcript-related columns #' #' #' @name pivot_transcript #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' @param .transcript The name of the transcript column #' #' -#' @details This functon extracts only transcript-related information for downstream analysis (e.g., visualisation). It is disruptive in the sense that it cannot be passed anymore to tidybulk function. 
+#' @details This function extracts only transcript-related information for +#' downstream analysis (e.g., visualisation). It is disruptive in the sense +#' that it cannot be passed anymore to tidybulk function. #' #' @return A `tbl` with transcript-related information #' @@ -4228,7 +4472,6 @@ setGeneric("pivot_transcript", function(.data, } #' pivot_transcript -#' @inheritParams pivot_transcript #' #' @docType methods #' @rdname pivot_transcript-methods @@ -4238,7 +4481,6 @@ setMethod("pivot_transcript", .pivot_transcript) #' pivot_transcript -#' @inheritParams pivot_transcript #' #' @docType methods #' @rdname pivot_transcript-methods @@ -4248,7 +4490,6 @@ setMethod("pivot_transcript", .pivot_transcript) #' pivot_transcript -#' @inheritParams pivot_transcript #' #' @docType methods #' @rdname pivot_transcript-methods @@ -4262,11 +4503,14 @@ setMethod("pivot_transcript", #' #' \lifecycle{questioning} #' -#' @description fill_missing_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with new observations +#' @description fill_missing_abundance() takes as input a `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with new observations #' #' @importFrom rlang enquo #' -#' #' @name fill_missing_abundance #' #' @param .data A `tbl` formatted as | | | | <...> | @@ -4275,7 +4519,8 @@ setMethod("pivot_transcript", #' @param .abundance The name of the transcript abundance column #' @param fill_with A numerical abundance with which fill the missing data points #' -#' @details This function fills the abundance of missing sample-transcript pair using the median of the 
sample group defined by the formula +#' @details This function fills the abundance of missing sample-transcript +#' pair using the median of the sample group defined by the formula #' #' @return A consistent object (to the input) non-sparse abundance #' @@ -4337,7 +4582,6 @@ setGeneric("fill_missing_abundance", function(.data, } #' fill_missing_abundance -#' @inheritParams fill_missing_abundance #' #' @docType methods #' @rdname fill_missing_abundance-methods @@ -4346,7 +4590,6 @@ setGeneric("fill_missing_abundance", function(.data, setMethod("fill_missing_abundance", "spec_tbl_df", .fill_missing_abundance) #' fill_missing_abundance -#' @inheritParams fill_missing_abundance #' #' @docType methods #' @rdname fill_missing_abundance-methods @@ -4355,7 +4598,6 @@ setMethod("fill_missing_abundance", "spec_tbl_df", .fill_missing_abundance) setMethod("fill_missing_abundance", "tbl_df", .fill_missing_abundance) #' fill_missing_abundance -#' @inheritParams fill_missing_abundance #' #' @docType methods #' @rdname fill_missing_abundance-methods @@ -4369,22 +4611,35 @@ setMethod("fill_missing_abundance", "tidybulk", .fill_missing_abundance) #' #' `r lifecycle::badge("maturing")` #' -#' @description impute_missing_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional sample-transcript pairs with imputed transcript abundance. +#' @description impute_missing_abundance() takes as input A `tbl` (with at least +#' three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with additional sample-transcript pairs with imputed +#' transcript abundance. 
#' #' @importFrom rlang enquo #' -#' #' @name impute_missing_abundance #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch) +#' @param .data A `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .formula A formula with no response variable, representing the +#' desired linear model where the first covariate is the factor of interest +#' and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch) #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param suffix A character string. This is added to the imputed count column names. If empty the count column are overwritten -#' @param force_scaling A boolean. In case a abundance-containing column is not scaled (columns with _scale suffix), setting force_scaling = TRUE will result in a scaling by library size, to compensating for a possible difference in sequencing depth. +#' @param suffix A character string. This is added to the imputed count column +#' names. If empty the count columns are overwritten +#' @param force_scaling A boolean. In case an abundance-containing column is not +#' scaled (columns with _scale suffix), setting force_scaling = TRUE will result +#' in a scaling by library size, to compensate for a possible difference in +#' sequencing depth. 
#' -#' @details This function imputes the abundance of missing sample-transcript pair using the median of the sample group defined by the formula +#' @details This function imputes the abundance of missing sample-transcript +#' pair using the median of the sample group defined by the formula #' #' @return A consistent object (to the input) non-sparse abundance #' @@ -4417,7 +4672,7 @@ setGeneric("impute_missing_abundance", function(.data, standardGeneric("impute_missing_abundance")) # Set internal -.impute_missing_abundance = function(.data, +.impute_missing_abundance <- function(.data, .formula, .sample = NULL, .transcript = NULL, @@ -4467,7 +4722,6 @@ setGeneric("impute_missing_abundance", function(.data, } #' impute_missing_abundance -#' @inheritParams impute_missing_abundance #' #' @docType methods #' @rdname impute_missing_abundance-methods @@ -4476,7 +4730,6 @@ setGeneric("impute_missing_abundance", function(.data, setMethod("impute_missing_abundance", "spec_tbl_df", .impute_missing_abundance) #' impute_missing_abundance -#' @inheritParams impute_missing_abundance #' #' @docType methods #' @rdname impute_missing_abundance-methods @@ -4485,7 +4738,6 @@ setMethod("impute_missing_abundance", "spec_tbl_df", .impute_missing_abundance) setMethod("impute_missing_abundance", "tbl_df", .impute_missing_abundance) #' impute_missing_abundance -#' @inheritParams impute_missing_abundance #' #' @docType methods #' @rdname impute_missing_abundance-methods @@ -4500,25 +4752,41 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' #' `r lifecycle::badge("maturing")` #' -#' @description test_differential_cellularity() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. 
+#' @description test_differential_cellularity() takes as input A `tbl` (with at +#' least three columns for sample, feature and transcript abundance) or +#' `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with additional columns for the statistics from the hypothesis test. #' #' @importFrom rlang enquo #' @importFrom stringr str_detect #' #' @name test_differential_cellularity #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula representing the desired linear model. The formula can be of two forms: multivariable (recommended) or univariable Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". The dot represents cell-type proportions, and it is mandatory. If censored regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\" +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .formula A formula representing the desired linear model. The formula +#' can be of two forms: multivariable (recommended) or univariable +#' Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". +#' The dot represents cell-type proportions, and it is mandatory. If censored +#' regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\" #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A string character. Either \"cibersort\", \"epic\" or \"llsr\". 
The regression method will be chosen based on being multivariable: lm or cox-regression (both on logit-transformed proportions); or univariable: beta or cox-regression (on logit-transformed proportions). See .formula for multi- or univariable choice. -#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance +#' @param method A string character. Either \"cibersort\", \"epic\" or \"llsr\". +#' The regression method will be chosen based on being multivariable: lm or +#' cox-regression (both on logit-transformed proportions); or univariable: +#' beta or cox-regression (on logit-transformed proportions). See .formula +#' for multi- or univariable choice. +#' @param reference A data frame. The transcript/cell_type data frame of +#' integer transcript abundance #' @param significance_threshold A real between 0 and 1 (usually 0.05). #' @param ... Further parameters passed to the method deconvolve_cellularity #' #' @details This routine applies a deconvolution method (e.g., Cibersort; DOI: 10.1038/nmeth.3337) -#' and passes the proportions inferred into a generalised linear model (DOI:dx.doi.org/10.1007/s11749-010-0189-z) +#' and passes the proportions inferred into a generalised linear model +#' (DOI:dx.doi.org/10.1007/s11749-010-0189-z) #' or a cox regression model (ISBN: 978-1-4757-3294-8) #' #' Underlying method for the generalised linear model: @@ -4546,7 +4814,9 @@ setMethod("impute_missing_abundance", "tidybulk", .impute_missing_abundance) #' mutate(.proportion_0_corrected = .proportion_0_corrected |> boot::logit()) %>% #' survival::coxph(.my_formula, .) #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for +#' the statistics from the hypothesis test (e.g., log fold change, p-value +#' and false discovery rate). 
#' #' #' @@ -4588,7 +4858,7 @@ setGeneric("test_differential_cellularity", function(.data, standardGeneric("test_differential_cellularity")) # Set internal -.test_differential_cellularity = function(.data, +.test_differential_cellularity <- function(.data, .formula, .sample = NULL, .transcript = NULL, @@ -4596,9 +4866,7 @@ setGeneric("test_differential_cellularity", function(.data, method = "cibersort", reference = X_cibersort, significance_threshold = 0.05, - ...) -{ - + ...) { # Fix NOTEs . = NULL @@ -4616,7 +4884,8 @@ setGeneric("test_differential_cellularity", function(.data, # Validate formula if(.formula |> format() |> str_detect(" \\.|\\. ", negate = TRUE)) - stop("tidybulk says: in the formula a dot must be present in either these forms \". ~\" or \"~ .\" with a white-space after or before respectively") + stop("tidybulk says: in the formula a dot must be present in either these ", + "forms \". ~\" or \"~ .\" with a white-space after or before respectively") test_differential_cellularity_( .data, @@ -4633,7 +4902,6 @@ setGeneric("test_differential_cellularity", function(.data, } #' test_differential_cellularity -#' @inheritParams test_differential_cellularity #' #' @docType methods #' @rdname test_differential_cellularity-methods @@ -4643,7 +4911,6 @@ setMethod("test_differential_cellularity", .test_differential_cellularity) #' test_differential_cellularity -#' @inheritParams test_differential_cellularity #' #' @docType methods #' @rdname test_differential_cellularity-methods @@ -4653,7 +4920,6 @@ setMethod("test_differential_cellularity", .test_differential_cellularity) #' test_differential_cellularity -#' @inheritParams test_differential_cellularity #' #' @docType methods #' @rdname test_differential_cellularity-methods @@ -4662,29 +4928,45 @@ setMethod("test_differential_cellularity", "tidybulk", .test_differential_cellularity) -#' Test of stratification of biological replicates based on tissue composition, one cell-type at the time, using Kaplan-meier 
curves. +#' Test of stratification of biological replicates based on tissue composition, +#' one cell-type at the time, using Kaplan-meier curves. #' #' `r lifecycle::badge("maturing")` #' -#' @description test_stratification_cellularity() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +#' @description test_stratification_cellularity() takes as input A `tbl` +#' (with at least three columns for sample, feature and transcript abundance) +#' or `SummarizedExperiment` (more convenient if abstracted to tibble with +#' library(tidySummarizedExperiment)) and returns a consistent object +#' (to the input) with additional columns for the statistics from the hypothesis test. #' #' @importFrom rlang enquo -#' #' @importFrom stringr str_detect #' #' @name test_stratification_cellularity #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) -#' @param .formula A formula representing the desired linear model. The formula can be of two forms: multivariable (recommended) or univariable Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". The dot represents cell-type proportions, and it is mandatory. If censored regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\" +#' @param .data A `tbl` (with at least three columns for sample, feature and +#' transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .formula A formula representing the desired linear model. 
The formula +#' can be of two forms: multivariable (recommended) or univariable +#' Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". +#' The dot represents cell-type proportions, and it is mandatory. If censored +#' regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\" #' @param .sample The name of the sample column #' @param .transcript The name of the transcript/gene column #' @param .abundance The name of the transcript/gene abundance column -#' @param method A string character. Either \"cibersort\", \"epic\" or \"llsr\". The regression method will be chosen based on being multivariable: lm or cox-regression (both on logit-transformed proportions); or univariable: beta or cox-regression (on logit-transformed proportions). See .formula for multi- or univariable choice. -#' @param reference A data frame. The transcript/cell_type data frame of integer transcript abundance +#' @param method A string character. Either \"cibersort\", \"epic\" or \"llsr\". +#' The regression method will be chosen based on being multivariable: lm or +#' cox-regression (both on logit-transformed proportions); or univariable: beta +#' or cox-regression (on logit-transformed proportions). See .formula for +#' multi- or univariable choice. +#' @param reference A data frame. The transcript/cell_type data frame of integer +#' transcript abundance #' @param ... 
Further parameters passed to the method deconvolve_cellularity #' #' @details This routine applies a deconvolution method (e.g., Cibersort; DOI: 10.1038/nmeth.3337) -#' and passes the proportions inferred into a generalised linear model (DOI:dx.doi.org/10.1007/s11749-010-0189-z) +#' and passes the proportions inferred into a generalised linear model +#' (DOI:dx.doi.org/10.1007/s11749-010-0189-z) #' or a cox regression model (ISBN: 978-1-4757-3294-8) #' #' @@ -4701,14 +4983,12 @@ setMethod("test_differential_cellularity", #' mutate(.high_cellularity = .proportion > median(.proportion)) |> #' survival::survdiff(data = data, .my_formula) #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). -#' -#' -#' +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). #' + #' @examples -#' -#' #' tidybulk::se_mini |> #' test_stratification_cellularity( #' survival::Surv(days, dead) ~ ., @@ -4732,16 +5012,14 @@ setGeneric("test_stratification_cellularity", function(.data, standardGeneric("test_stratification_cellularity")) # Set internal -.test_stratification_cellularity = function(.data, +.test_stratification_cellularity <- function(.data, .formula, .sample = NULL, .transcript = NULL, .abundance = NULL, method = "cibersort", reference = X_cibersort, - ...) -{ - + ...) { # Fix NOTEs . = NULL @@ -4759,7 +5037,8 @@ setGeneric("test_stratification_cellularity", function(.data, # Validate formula if(.formula |> format() %>% str_detect(" \\.|\\. ", negate = TRUE)) - stop("tidybulk says: in the formula a dot must be present in either these forms \". ~\" or \"~ .\" with a white-space after or before respectively") + stop("tidybulk says: in the formula a dot must be present in either ", + "these forms \". 
~\" or \"~ .\" with a white-space after or before respectively") test_stratification_cellularity_( .data, @@ -4775,7 +5054,6 @@ setGeneric("test_stratification_cellularity", function(.data, } #' test_stratification_cellularity -#' @inheritParams test_stratification_cellularity #' #' @docType methods #' @rdname test_stratification_cellularity-methods @@ -4785,7 +5063,6 @@ setMethod("test_stratification_cellularity", .test_stratification_cellularity) #' test_stratification_cellularity -#' @inheritParams test_stratification_cellularity #' #' @docType methods #' @rdname test_stratification_cellularity-methods @@ -4795,7 +5072,6 @@ setMethod("test_stratification_cellularity", .test_stratification_cellularity) #' test_stratification_cellularity -#' @inheritParams test_stratification_cellularity #' #' @docType methods #' @rdname test_stratification_cellularity-methods @@ -4814,25 +5090,25 @@ setMethod("test_stratification_cellularity", #' #' @importFrom rlang enquo #' -#' #' @name get_bibliography #' -#' @param .data A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +#' @param .data A `tbl` (with at least three columns for sample, feature +#' and transcript abundance) or `SummarizedExperiment` (more convenient if +#' abstracted to tibble with library(tidySummarizedExperiment)) #' -#' @details This methods returns the bibliography list of your workflow from the internals of a tidybulk object (attr(., "internals")) +#' @details This methods returns the bibliography list of your workflow from +#' the internals of a tidybulk object (attr(., "internals")) #' #' #' @examples -#' -#' #' get_bibliography(tidybulk::se_mini) #' #' -#' #' @docType methods #' @rdname get_bibliography-methods #' -#' @return NULL. It prints a list of bibliography references for the software used through the workflow. +#' @return NULL. 
It prints a list of bibliography references for the software +#' used through the workflow. #' @export #' setGeneric("get_bibliography", function(.data) @@ -4867,7 +5143,6 @@ setGeneric("get_bibliography", function(.data) } #' get_bibliography -#' @inheritParams get_bibliography #' #' @docType methods #' @rdname get_bibliography-methods @@ -4877,7 +5152,6 @@ setMethod("get_bibliography", .get_bibliography) #' get_bibliography -#' @inheritParams get_bibliography #' #' @docType methods #' @rdname get_bibliography-methods @@ -4887,7 +5161,6 @@ setMethod("get_bibliography", .get_bibliography) #' get_bibliography -#' @inheritParams get_bibliography #' #' @docType methods #' @rdname get_bibliography-methods @@ -4897,7 +5170,6 @@ setMethod("get_bibliography", .get_bibliography) #' get_bibliography -#' @inheritParams get_bibliography #' #' @docType methods #' @rdname get_bibliography-methods @@ -4915,7 +5187,8 @@ setMethod("get_bibliography", #' @importFrom rlang quo_is_null #' #' @param tbl A tibble -#' @param rownames The column name of the input tibble that will become the rownames of the output matrix +#' @param rownames The column name of the input tibble that will become +#' the rownames of the output matrix #' @param do_check A boolean #' #' @return A matrix @@ -4935,7 +5208,6 @@ as_matrix <- function(tbl, rownames = enquo(rownames) tbl %>% - # Through warning if data frame is not numerical beside the rownames column (if present) ifelse_pipe( do_check && @@ -4948,7 +5220,8 @@ as_matrix <- function(tbl, unique() %>% `%in%`(c("numeric", "integer")) |> not() |> any(), ~ { - warning("tidybulk says: there are NON-numerical columns, the matrix will NOT be numerical") + warning("tidybulk says: there are NON-numerical columns, ", + "the matrix will NOT be numerical") .x } ) |> diff --git a/R/methods_SE.R b/R/methods_SE.R index d3aeac8d..cba366f0 100755 --- a/R/methods_SE.R +++ b/R/methods_SE.R @@ -50,8 +50,6 @@ #' @export #' #' -#' @inheritParams tidybulk -#' #' @docType 
methods #' @rdname tidybulk-methods #' @@ -63,8 +61,6 @@ setMethod("tidybulk", "SummarizedExperiment", .tidybulk_se) #' #' @export #' -#' @inheritParams tidybulk -#' #' @docType methods #' @rdname tidybulk-methods #' @@ -110,7 +106,8 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se) if (is_present(reference_selection_function) & !is.null(reference_selection_function)) { # Signal the deprecation to the user - deprecate_warn("1.1.8", "tidybulk::scale_abundance(reference_selection_function = )", details = "The argument reference_selection_function is now deprecated please use reference_sample. By default the reference selection function is max()") + deprecate_warn("1.1.8", "tidybulk::scale_abundance(reference_selection_function = )", + details = "The argument reference_selection_function is now deprecated please use reference_sample. By default the reference selection function is max()") } @@ -158,8 +155,9 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se) ) # Communicate the reference if chosen by default - if(is.null(reference_sample)) message(sprintf("tidybulk says: the sample with largest library size %s was chosen as reference for scaling", reference)) - + if(is.null(reference_sample)) { + message(sprintf("tidybulk says: the sample with largest library size %s was chosen as reference for scaling", reference)) + } # Calculate TMM nf <- edgeR::calcNormFactors( @@ -205,12 +203,11 @@ setMethod("tidybulk", "RangedSummarizedExperiment", .tidybulk_se) memorise_methods_used(c("edger", "tmm")) %>% # Attach column internals - add_tt_columns(.abundance_scaled = !!(function(x, v) enquo(v))(x,!!as.symbol(value_scaled))) + add_tt_columns(.abundance_scaled = !!(function(x, v) enquo(v))(x,!!as.symbol(value_scaled))) } #' scale_abundance -#' @inheritParams scale_abundance #' #' @docType methods #' @rdname scale_abundance-methods @@ -222,7 +219,6 @@ setMethod("scale_abundance", .scale_abundance_se) #' scale_abundance -#' @inheritParams 
scale_abundance #' #' @docType methods #' @rdname scale_abundance-methods @@ -343,7 +339,6 @@ setMethod("scale_abundance", } #' quantile_normalise_abundance -#' @inheritParams quantile_normalise_abundance #' #' @docType methods #' @rdname quantile_normalise_abundance-methods @@ -355,7 +350,6 @@ setMethod("quantile_normalise_abundance", .quantile_normalise_abundance_se) #' quantile_normalise_abundance -#' @inheritParams quantile_normalise_abundance #' #' @docType methods #' @rdname quantile_normalise_abundance-methods @@ -389,7 +383,9 @@ setMethod("quantile_normalise_abundance", method %>% when( (.) == "kmeans" ~ get_clusters_kmeans_bulk_SE, - (.) == "SNN" ~ stop("tidybulk says: Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable."), #get_clusters_SNN_bulk_SE, + (.) == "SNN" ~ stop("tidybulk says: Matrix package (v1.3-3) causes an ", + "error with Seurat::FindNeighbors used in this method. We are trying to ", + "solve this issue. 
At the moment this option in unaviable."), #get_clusters_SNN_bulk_SE, ~ stop("tidybulk says: the only supported methods are \"kmeans\" or \"SNN\" ") ) @@ -423,7 +419,6 @@ setMethod("quantile_normalise_abundance", } #' cluster_elements -#' @inheritParams cluster_elements #' #' @docType methods #' @rdname cluster_elements-methods @@ -435,7 +430,6 @@ setMethod("cluster_elements", .cluster_elements_se) #' cluster_elements -#' @inheritParams cluster_elements #' #' @importFrom rlang inform #' @@ -542,7 +536,8 @@ setMethod("cluster_elements", # Communicate the attribute added { - rlang::inform(sprintf("tidybulk says: to access the raw results do `attr(..., \"internals\")$%s`", method), .frequency_id = sprintf("Access %s results", method), .frequency = "always") + rlang::inform(sprintf("tidybulk says: to access the raw results do `attr(..., \"internals\")$%s`", method), + .frequency_id = sprintf("Access %s results", method), .frequency = "always") (.) } @@ -551,7 +546,6 @@ setMethod("cluster_elements", } #' reduce_dimensions -#' @inheritParams reduce_dimensions #' #' @docType methods #' @rdname reduce_dimensions-methods @@ -563,7 +557,6 @@ setMethod("reduce_dimensions", .reduce_dimensions_se) #' reduce_dimensions -#' @inheritParams reduce_dimensions #' #' @docType methods #' @rdname reduce_dimensions-methods @@ -648,7 +641,6 @@ setMethod("reduce_dimensions", } #' rotate_dimensions -#' @inheritParams rotate_dimensions #' #' @docType methods #' @rdname rotate_dimensions-methods @@ -660,7 +652,6 @@ setMethod("rotate_dimensions", .rotate_dimensions_se) #' rotate_dimensions -#' @inheritParams rotate_dimensions #' #' @docType methods #' @rdname rotate_dimensions-methods @@ -672,7 +663,7 @@ setMethod("rotate_dimensions", .rotate_dimensions_se) -.remove_redundancy_se = function(.data, +.remove_redundancy_se <- function(.data, .element = NULL, .feature = NULL, .abundance = NULL, @@ -747,7 +738,8 @@ setMethod("rotate_dimensions", ) } , ~ stop( - "tidybulk says: method must be either 
\"correlation\" for dropping correlated elements or \"reduced_dimension\" to drop the closest pair according to two dimensions (e.g., PCA)" + "tidybulk says: method must be either \"correlation\" for dropping ", + "correlated elements or \"reduced_dimension\" to drop the closest pair according to two dimensions (e.g., PCA)" ) ) @@ -763,13 +755,14 @@ setMethod("rotate_dimensions", when( method == "correlation" ~ memorise_methods_used(., "widyr"), method == "reduced_dimensions" ~ (.), - ~ stop("tidybulk says: method must be either \"correlation\" for dropping correlated elements or \"reduced_dimension\" to drop the closest pair according to two dimensions (e.g., PCA)") + ~ stop("tidybulk says: method must be either \"correlation\" for ", + "dropping correlated elements or \"reduced_dimension\" to drop ", + "the closest pair according to two dimensions (e.g., PCA)") ) } #' remove_redundancy -#' @inheritParams remove_redundancy #' #' @docType methods #' @rdname remove_redundancy-methods @@ -781,7 +774,6 @@ setMethod("remove_redundancy", .remove_redundancy_se) #' remove_redundancy -#' @inheritParams remove_redundancy #' #' @importFrom rlang quo #' @@ -795,26 +787,19 @@ setMethod("remove_redundancy", .remove_redundancy_se) -.adjust_abundance_se = function(.data, - +.adjust_abundance_se <- function(.data, # DEPRECATED .formula = NULL, - .factor_unwanted = NULL, .factor_of_interest = NULL, - .abundance = NULL, - method = "combat_seq", - - ..., # DEPRECATED transform = NULL, inverse_transform = NULL ) { - # Fix NOTEs . 
= NULL @@ -836,7 +821,9 @@ setMethod("remove_redundancy", ) { # Signal the deprecation to the user - deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(transform = )", details = "The argument transform and inverse_transform is now deprecated, please use method argument instead specifying \"combat\" or \"combat_seq\".") + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(transform = )", + details = "The argument transform and inverse_transform is now deprecated, + please use method argument instead specifying \"combat\" or \"combat_seq\".") } @@ -848,12 +835,17 @@ setMethod("remove_redundancy", if (is_present(.formula) & !is.null(.formula)) { # Signal the deprecation to the user - deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(.formula = )", details = "The argument .formula is now deprecated, please use factor_unwanted and factor_of_interest. Using the formula, the first factor is of interest and the second is unwanted") + deprecate_warn("1.11.6", "tidybulk::test_differential_abundance(.formula = )", + details = "The argument .formula is now deprecated, + please use factor_unwanted and factor_of_interest. 
+ Using the formula, the first factor is of interest and the second is unwanted") # Check that .formula includes at least two covariates if (parse_formula(.formula) %>% length %>% st(2)) stop( - "The .formula must contain two covariates, the first being the factor of interest, the second being the factor of unwanted variation" + "The .formula must contain two covariates, + the first being the factor of interest, + the second being the factor of unwanted variation" ) # Check that .formula includes no more than two covariates at the moment @@ -873,7 +865,9 @@ setMethod("remove_redundancy", # Create design matrix design = model.matrix( - object = as.formula(sprintf("~ %s", colData(.data) |> as_tibble() |> select(!!.factor_of_interest) |> colnames() |> str_c(collapse = '+'))), + object = as.formula(sprintf("~ %s", colData(.data) |> as_tibble() |> + select(!!.factor_of_interest) |> + colnames() |> str_c(collapse = '+'))), # get first argument of the .formula data = colData(.data) ) @@ -895,14 +889,14 @@ setMethod("remove_redundancy", .data |> assay(my_assay) |> # Check if log transform is needed log1p() %>% - # Add little noise to avoid all 0s for a covariate that would error combat code (not statistics that would be fine) + # Add little noise to avoid all 0s for a covariate that would error combat + # code (not statistics that would be fine) `+` (rnorm(length(.), 0, 0.000001)) for(i in colnames(my_batch)){ my_assay_adjusted = my_assay_adjusted %>% - # Run combat sva::ComBat( batch = my_batch[,i] |> pull(1), @@ -914,12 +908,11 @@ setMethod("remove_redundancy", # Tranfrom back my_assay_adjusted = - my_assay_adjusted %>% - expm1() |> - apply(2, pmax, 0) - + my_assay_adjusted |> + expm1() |> + (\(x) apply(x, 2, pmax, 0))() } - else if(tolower(method) == "combat_seq"){ + else if (tolower(method) == "combat_seq"){ my_assay_adjusted = .data %>% @@ -941,7 +934,9 @@ setMethod("remove_redundancy", unwanted_covariate_matrix = model.matrix( - object = as.formula(sprintf("~ 0 + 
%s", colData(.data) |> as_tibble() |> select(!!.factor_unwanted) |> colnames() |> str_c(collapse = '+'))), + object = as.formula(sprintf("~ 0 + %s", colData(.data) |> as_tibble() |> + select(!!.factor_unwanted) |> colnames() |> + str_c(collapse = '+'))), # get first argument of the .formula data = colData(.data) ) @@ -959,7 +954,8 @@ setMethod("remove_redundancy", apply(2, pmax, 0) } else { - stop("tidybulk says: the argument \"method\" must be \"combat_seq\", \"combat\", or \"limma_remove_batch_effect\"") + stop("tidybulk says: the argument \"method\" must be \"combat_seq\", + \"combat\", or \"limma_remove_batch_effect\"") } @@ -981,7 +977,6 @@ setMethod("remove_redundancy", } #' adjust_abundance -#' @inheritParams adjust_abundance #' #' @docType methods #' @rdname adjust_abundance-methods @@ -993,7 +988,6 @@ setMethod("adjust_abundance", .adjust_abundance_se) #' adjust_abundance -#' @inheritParams adjust_abundance #' #' @docType methods #' @rdname adjust_abundance-methods @@ -1010,14 +1004,12 @@ setMethod("adjust_abundance", #' @importFrom SummarizedExperiment SummarizedExperiment #' @importFrom GenomicRanges makeGRangesListFromDataFrame #' @importFrom dplyr setdiff -.aggregate_duplicates_se = function(.data, - +.aggregate_duplicates_se <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, aggregation_function = sum, keep_integer = TRUE) { - # Fix NOTEs . = NULL @@ -1025,12 +1017,19 @@ setMethod("adjust_abundance", .transcript = enquo(.transcript) - if(quo_is_null(.transcript)) stop("tidybulk says: using SummarizedExperiment with aggregate_duplicates, you need to specify .transcript parameter. It should be a feature-wise column (e.g. gene symbol) that you want to collapse he features with (e.g. ensembl). 
It cannot be the representation of rownames(SummarizedExperiment), as those are unique by definition, and not part of rowData per-se.") + if(quo_is_null(.transcript)) stop("tidybulk says: using SummarizedExperiment + with aggregate_duplicates, you need to specify .transcript parameter. + It should be a feature-wise column (e.g. gene symbol) that you want to + collapse he features with (e.g. ensembl). It cannot be the representation + of rownames(SummarizedExperiment), as those are unique by definition, + and not part of rowData per-se.") if(!quo_name(.transcript) %in% colnames( .data %>% rowData())) - stop("tidybulk says: the .transcript argument must be a feature-wise column names. The feature-wise information can be found with rowData()") + stop("tidybulk says: the .transcript argument must be a feature-wise column + names. The feature-wise information can be found with rowData()") if(!is.null(.sample) | !is.null(.abundance)) - warning("tidybulk says: for SummarizedExperiment objects only the argument .transcript (feature ID to collapse) is considered") + warning("tidybulk says: for SummarizedExperiment objects only the argument + .transcript (feature ID to collapse) is considered") collapse_function = function(x){ x %>% unique() %>% paste(collapse = "___") } @@ -1079,7 +1078,8 @@ setMethod("adjust_abundance", # If no duplicate exit if(!nrow(new_row_data) is.na() |> which() |> length() |> gt(0)) - stop(sprintf("tidybulk says: you have some %s that are NAs", quo_name(.transcript))) + stop(sprintf("tidybulk says: you have some %s that are NAs", + quo_name(.transcript))) .x = combineByRow(.x, aggregation_function) .x = .x[match(new_row_data[,quo_name(.transcript)], rownames(.x)),,drop=FALSE] @@ -1127,9 +1128,12 @@ setMethod("adjust_abundance", # Through warning if there are logicals of factor in the data frame # because they cannot be merged if they are not unique - if (length(non_standard_columns)>0 & new_range_data %>% pull(!!.transcript) %>% duplicated() %>% 
which() %>% length() %>% gt(0) ) { + if (length(non_standard_columns)>0 & new_range_data %>% + pull(!!.transcript) %>% duplicated() %>% which() %>% length() %>% gt(0) ) { warning(paste(capture.output({ - cat(crayon::blue("tidybulk says: If duplicates exist from the following columns, only the first instance was taken (lossy behaviour), as aggregating those classes with concatenation is not possible.\n")) + cat(crayon::blue("tidybulk says: If duplicates exist from the following + columns, only the first instance was taken (lossy behaviour), + as aggregating those classes with concatenation is not possible.\n")) print(rowData(.data)[1,non_standard_columns,drop=FALSE]) }), collapse = "\n")) } @@ -1173,7 +1177,6 @@ setMethod("adjust_abundance", } #' aggregate_duplicates -#' @inheritParams aggregate_duplicates #' #' @docType methods #' @rdname aggregate_duplicates-methods @@ -1185,7 +1188,6 @@ setMethod("aggregate_duplicates", .aggregate_duplicates_se) #' aggregate_duplicates -#' @inheritParams aggregate_duplicates #' #' @docType methods #' @rdname aggregate_duplicates-methods @@ -1199,12 +1201,11 @@ setMethod("aggregate_duplicates", #' @importFrom rlang quo_is_symbolic -.deconvolve_cellularity_se = function(.data, +.deconvolve_cellularity_se <- function(.data, reference = X_cibersort, method = "cibersort", prefix = "", ...) { - # Fix NOTEs . 
= NULL @@ -1242,13 +1243,15 @@ setMethod("aggregate_duplicates", # Check if package is installed, otherwise install if (find.package("class", quiet = TRUE) %>% length %>% equals(0)) { message("Installing class needed for Cibersort") - install.packages("class", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports")) + install.packages("class", repos = "https://cloud.r-project.org", + dependencies = c("Depends", "Imports")) } # Check if package is installed, otherwise install if (find.package("e1071", quiet = TRUE) %>% length %>% equals(0)) { message("Installing e1071 needed for Cibersort") - install.packages("e1071", repos = "https://cloud.r-project.org", dependencies = c("Depends", "Imports")) + install.packages("e1071", repos = "https://cloud.r-project.org", + dependencies = c("Depends", "Imports")) } # Check if package is installed, otherwise install @@ -1266,7 +1269,8 @@ setMethod("aggregate_duplicates", # Validate reference validate_signature_SE(., reference) - do.call(my_CIBERSORT, list(Y = ., X = reference, QN=FALSE) %>% c(dots_args)) %$% + do.call(my_CIBERSORT, list(Y = ., X = reference, QN=FALSE) %>% + c(dots_args)) %$% proportions %>% as_tibble(rownames = quo_name(.sample)) %>% select(-`P-value`,-Correlation,-RMSE) @@ -1307,7 +1311,9 @@ setMethod("aggregate_duplicates", } if(method %in% c("mcp_counter", "quantiseq", "xcell") & !"immunedeconv" %in% (.packages())) - stop("tidybulk says: for xcell, mcp_counter, or quantiseq deconvolution you should have the package immunedeconv attached. Please execute library(immunedeconv)") + stop("tidybulk says: for xcell, mcp_counter, + or quantiseq deconvolution you should have the package + immunedeconv attached. Please execute library(immunedeconv)") (.) 
%>% deconvolute(method %>% tolower, tumor = FALSE) %>% @@ -1342,7 +1348,6 @@ setMethod("aggregate_duplicates", } #' deconvolve_cellularity -#' @inheritParams deconvolve_cellularity #' #' @docType methods #' @rdname deconvolve_cellularity-methods @@ -1354,7 +1359,6 @@ setMethod("deconvolve_cellularity", .deconvolve_cellularity_se) #' deconvolve_cellularity -#' @inheritParams deconvolve_cellularity #' #' @importFrom rlang inform #' @@ -1380,8 +1384,7 @@ setMethod( scaling_method = "TMM", omit_contrast_in_colnames = FALSE, prefix = "", - ...) -{ + ...) { .abundance = enquo(.abundance) @@ -1392,7 +1395,8 @@ setMethod( if (is_present(.contrasts) & !is.null(.contrasts)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") + deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", + details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") contrasts = .contrasts } @@ -1402,7 +1406,8 @@ setMethod( tidybulk says: All testing methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated. Therefore, it is essential to add covariates such as batch effects (if applicable) in the formula. -=====================================", .frequency_id = "All testing methods use raw counts", .frequency = "once") +=====================================", + .frequency_id = "All testing methods use raw counts", .frequency = "once") # Test test_above_log2_fold_change @@ -1412,7 +1417,8 @@ such as batch effects (if applicable) in the formula. 
# Filter abundant if performed .data = filter_if_abundant_were_identified(.data) - if(tolower(method) %in% c("edger_quasi_likelihood", "edger_likelihood_ratio", "edger_robust_likelihood_ratio")) + if(tolower(method) %in% c("edger_quasi_likelihood", "edger_likelihood_ratio", + "edger_robust_likelihood_ratio")) my_differential_abundance = get_differential_transcript_abundance_bulk_SE( .data, @@ -1476,7 +1482,10 @@ such as batch effects (if applicable) in the formula. ... ) else - stop("tidybulk says: the only methods supported at the moment are \"edgeR_quasi_likelihood\" (i.e., QLF), \"edgeR_likelihood_ratio\" (i.e., LRT), \"limma_voom\", \"limma_voom_sample_weights\", \"DESeq2\", \"glmmseq_lme4\", \"glmmseq_glmmTMB\"") + stop("tidybulk says: the only methods supported at the moment are + \"edgeR_quasi_likelihood\" (i.e., QLF), \"edgeR_likelihood_ratio\" + (i.e., LRT), \"limma_voom\", \"limma_voom_sample_weights\", \"DESeq2\", + \"glmmseq_lme4\", \"glmmseq_glmmTMB\"") # If action is get just return the statistics if(action == "get") return(my_differential_abundance$result) @@ -1495,18 +1504,24 @@ such as batch effects (if applicable) in the formula. # Add bibliography when( - tolower(method) == "edger_likelihood_ratio" ~ (.) %>% memorise_methods_used(c("edger", "edgeR_likelihood_ratio")), - tolower(method) == "edger_quasi_likelihood" ~ (.) %>% memorise_methods_used(c("edger", "edgeR_quasi_likelihood")), - tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% memorise_methods_used(c("edger", "edger_robust_likelihood_ratio")), + tolower(method) == "edger_likelihood_ratio" ~ (.) %>% + memorise_methods_used(c("edger", "edgeR_likelihood_ratio")), + tolower(method) == "edger_quasi_likelihood" ~ (.) %>% + memorise_methods_used(c("edger", "edgeR_quasi_likelihood")), + tolower(method) == "edger_robust_likelihood_ratio" ~ (.) %>% + memorise_methods_used(c("edger", "edger_robust_likelihood_ratio")), tolower(method) == "limma_voom" ~ (.) 
%>% memorise_methods_used("voom"), - tolower(method) == "limma_voom_sample_weights" ~ (.) %>% memorise_methods_used("voom_sample_weights"), + tolower(method) == "limma_voom_sample_weights" ~ (.) %>% + memorise_methods_used("voom_sample_weights"), tolower(method) == "deseq2" ~ (.) %>% memorise_methods_used("deseq2"), - tolower(method) %in% c("glmmseq_lme4", "glmmseq_glmmtmb") ~ (.) %>% memorise_methods_used("glmmseq"), + tolower(method) %in% c("glmmseq_lme4", "glmmseq_glmmtmb") ~ (.) %>% + memorise_methods_used("glmmseq"), ~ stop("tidybulk says: method not supported") ) %>% when( - !is.null(test_above_log2_fold_change) ~ (.) %>% memorise_methods_used("treat"), + !is.null(test_above_log2_fold_change) ~ (.) %>% + memorise_methods_used("treat"), ~ (.) ) %>% @@ -1514,7 +1529,10 @@ such as batch effects (if applicable) in the formula. # Communicate the attribute added { - rlang::inform(sprintf("tidybulk says: to access the raw results (fitted GLM) do `attr(..., \"internals\")$%s`", method), .frequency_id = sprintf("Access DE results %s", method), .frequency = "always") + rlang::inform(sprintf("tidybulk says: to access the raw results (fitted GLM) + do `attr(..., \"internals\")$%s`", method), + .frequency_id = sprintf("Access DE results %s", method), + .frequency = "always") (.) } @@ -1524,7 +1542,6 @@ such as batch effects (if applicable) in the formula. 
} #' test_differential_abundance -#' @inheritParams test_differential_abundance #' #' @docType methods #' @rdname test_differential_abundance-methods @@ -1538,7 +1555,6 @@ setMethod( ) #' test_differential_abundance -#' @inheritParams test_differential_abundance #' #' @docType methods #' @rdname test_differential_abundance-methods @@ -1584,7 +1600,6 @@ setMethod( } #' keep_variable -#' @inheritParams keep_variable #' #' @docType methods #' @rdname keep_variable-methods @@ -1596,7 +1611,6 @@ setMethod("keep_variable", .keep_variable_se) #' keep_variable -#' @inheritParams keep_variable #' #' @importFrom purrr map_chr #' @importFrom tidyr unite @@ -1611,15 +1625,13 @@ setMethod("keep_variable", "RangedSummarizedExperiment", .keep_variable_se) -.identify_abundant_se = function(.data, +.identify_abundant_se <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, factor_of_interest = NULL, minimum_counts = 10, - minimum_proportion = 0.7) -{ - + minimum_proportion = 0.7) { # Fix NOTEs . = NULL @@ -1632,7 +1644,9 @@ setMethod("keep_variable", !is.null(factor_of_interest) && !factor_of_interest |> quo_is_null() && !factor_of_interest |> quo_is_symbolic() - ) stop("tidybulk says: factor_of_interest must be symbolic (i.e. column name/s not surrounded by single or double quotes) and not a character.") + ) stop("tidybulk says: factor_of_interest must be symbolic + (i.e. 
column name/s not surrounded by single or double quotes) + and not a character.") # Check factor_of_interest @@ -1641,7 +1655,8 @@ setMethod("keep_variable", quo_is_symbolic(factor_of_interest) && (quo_names(factor_of_interest) %in% colnames(colData(.data)) |> all() %>% not()) ) - stop(sprintf("tidybulk says: the column %s is not present in colData", quo_names(factor_of_interest))) + stop(sprintf("tidybulk says: the column %s is not present in colData", + quo_names(factor_of_interest))) if (minimum_counts < 0) stop("The parameter minimum_counts must be > 0") @@ -1651,7 +1666,8 @@ setMethod("keep_variable", # If column is present use this instead of doing more work if(".abundant" %in% colnames(colData(.data))){ - message("tidybulk says: the column .abundant already exists in colData. Nothing was done") + message("tidybulk says: the column .abundant already exists in colData. + Nothing was done") # Return return(.data) @@ -1685,7 +1701,8 @@ setMethod("keep_variable", unlist() %in% c("numeric", "integer", "double") |> any() ) - stop("tidybulk says: The factor(s) of interest must not include continuous variables (e.g., integer,numeric, double).") + stop("tidybulk says: The factor(s) of interest must not include continuous + variables (e.g., integer,numeric, double).") string_factor_of_interest = colData(.data)[, factor_of_interest, drop=FALSE] |> @@ -1743,7 +1760,6 @@ setMethod("keep_variable", #' identify_abundant -#' @inheritParams identify_abundant #' #' @docType methods #' @rdname identify_abundant-methods @@ -1755,7 +1771,6 @@ setMethod("identify_abundant", .identify_abundant_se) #' identify_abundant -#' @inheritParams identify_abundant #' #' @docType methods #' @rdname identify_abundant-methods @@ -1801,7 +1816,6 @@ setMethod("identify_abundant", } #' keep_abundant -#' @inheritParams keep_abundant #' #' @docType methods #' @rdname keep_abundant-methods @@ -1813,7 +1827,6 @@ setMethod("keep_abundant", .keep_abundant_se) #' keep_abundant -#' @inheritParams 
keep_abundant #' #' @docType methods #' @rdname keep_abundant-methods @@ -1832,21 +1845,23 @@ setMethod("keep_abundant", #' #' #' -.test_gene_enrichment_SE = function(.data, - .formula, - .sample = NULL, - .entrez, - .abundance = NULL, - contrasts = NULL, - methods = c("camera" , "roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), - species, - cores = 10, - - # DEPRECATED - method = NULL, - .contrasts = NULL - ) { +.test_gene_enrichment_SE <- function( + .data, + .formula, + .sample = NULL, + .entrez, + .abundance = NULL, + contrasts = NULL, + methods = c("camera", "roast", "safe", "gage", "padog", "globaltest","ora"), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", + "kegg_disease", "kegg_metabolism", "kegg_signaling"), + species, + cores = 10, + + # DEPRECATED + method = NULL, + .contrasts = NULL + ) { # Fix NOTEs . = NULL @@ -1855,7 +1870,8 @@ setMethod("keep_abundant", if (is_present(method) & !is.null(method)) { # Signal the deprecation to the user - deprecate_warn("1.3.2", "tidybulk::test_gene_enrichment(method = )", details = "The argument method is now deprecated please use methods") + deprecate_warn("1.3.2", "tidybulk::test_gene_enrichment(method = )", + details = "The argument method is now deprecated please use methods") methods = method } @@ -1863,7 +1879,8 @@ setMethod("keep_abundant", if (is_present(.contrasts) & !is.null(.contrasts)) { # Signal the deprecation to the user - deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") + deprecate_warn("1.7.4", "tidybulk::test_differential_abundance(.contrasts = )", + details = "The argument .contrasts is now deprecated please use contrasts (without the dot).") contrasts = .contrasts } @@ -1883,7 +1900,8 @@ setMethod("keep_abundant", # Check if 
duplicated entrez if(rowData(.data)[,quo_name(.entrez)] %>% duplicated() %>% any()) - stop("tidybulk says: There are duplicated .entrez IDs. Please use aggregate_duplicates(.transcript = entrez).") + stop("tidybulk says: There are duplicated .entrez IDs. + Please use aggregate_duplicates(.transcript = entrez).") # For use within when .my_data = .data @@ -1924,12 +1942,14 @@ setMethod("keep_abundant", # Check if package is installed, otherwise install if (find.package("EGSEA", quiet = TRUE) %>% length %>% equals(0)) { stop(" - EGSEA not installed. Please install it. EGSEA require manual installation for not overwelming the user in case it is not needed. - BiocManager::install(\"EGSEA\", ask = FALSE) + EGSEA not installed. Please install it. EGSEA require manual installation + for not overwelming the user in case it is not needed. + BiocManager::install(\"EGSEA\", ask = FALSE) ") } if (!"EGSEA" %in% (.packages())) { - stop("EGSEA package not loaded. Please run library(\"EGSEA\"). With this setup, EGSEA require manual loading, for technical reasons.") + stop("EGSEA package not loaded. Please run library(\"EGSEA\"). 
+ With this setup, EGSEA require manual loading, for technical reasons.") } dge = @@ -1999,7 +2019,8 @@ setMethod("keep_abundant", } - idx = buildIdx(entrezIDs = rownames(dge), species = species, msigdb.gsets = msigdb.gsets, + idx = buildIdx(entrezIDs = rownames(dge), species = species, + msigdb.gsets = msigdb.gsets, kegg.exclude = kegg.exclude) # Due to a bug with kegg pathview overlays, this collection is run without report @@ -2052,7 +2073,9 @@ setMethod("keep_abundant", } if (length(kegg_genesets) != 0) { - message("tidybulk says: due to a bug in the call to KEGG database (http://supportupgrade.bioconductor.org/p/122172/#122218), the analysis for this database is run without report production.") + message("tidybulk says: due to a bug in the call to KEGG database + (http://supportupgrade.bioconductor.org/p/122172/#122218), + the analysis for this database is run without report production.") res_kegg = dge %>% @@ -2100,7 +2123,6 @@ setMethod("keep_abundant", } #' test_gene_enrichment -#' @inheritParams test_gene_enrichment #' #' @docType methods #' @rdname test_gene_enrichment-methods @@ -2111,7 +2133,6 @@ setMethod("test_gene_enrichment", .test_gene_enrichment_SE) #' test_gene_enrichment -#' @inheritParams test_gene_enrichment #' #' @docType methods #' @rdname test_gene_enrichment-methods @@ -2139,7 +2160,8 @@ setMethod("test_gene_enrichment", if (is_present(gene_set) & !is.null(gene_set)) { # Signal the deprecation to the user - deprecate_warn("1.3.1", "tidybulk::.test_gene_overrepresentation(gene_set = )", details = "The argument gene_set is now deprecated please use gene_sets.") + deprecate_warn("1.3.1", "tidybulk::.test_gene_overrepresentation(gene_set = )", + details = "The argument gene_set is now deprecated please use gene_sets.") gene_sets = gene_set } @@ -2156,7 +2178,8 @@ setMethod("test_gene_enrichment", stop("tidybulk says: the .entrez parameter appears to no be set") # Check column type - if (.data %>% rowData() %>% as_tibble(rownames = 
f_(.data)$name) %>% mutate(my_do_test = !!.do_test) %>% pull(my_do_test) |> is("logical") %>% not()) + if (.data %>% rowData() %>% as_tibble(rownames = f_(.data)$name) %>% + mutate(my_do_test = !!.do_test) %>% pull(my_do_test) |> is("logical") %>% not()) stop("tidybulk says: .do_test column must be logical (i.e., TRUE or FALSE)") # Check packages msigdbr @@ -2168,7 +2191,9 @@ setMethod("test_gene_enrichment", # Check is correct species name if(species %in% msigdbr::msigdbr_species()$species_name %>% not()) - stop(sprintf("tidybulk says: wrong species name. MSigDB uses the latin species names (e.g., %s)", paste(msigdbr::msigdbr_species()$species_name, collapse=", "))) + stop(sprintf("tidybulk says: wrong species name. MSigDB uses the latin species + names (e.g., %s)", paste(msigdbr::msigdbr_species()$species_name, + collapse=", "))) # # Check if missing entrez # if(.data %>% filter(!!.entrez %>% is.na) %>% nrow() %>% gt(0) ){ @@ -2184,13 +2209,13 @@ setMethod("test_gene_enrichment", entrez_over_to_gsea(species, gene_collections = gene_sets) %>% # Add methods used - memorise_methods_used(c("clusterProfiler", "msigdbr", "msigdb"), object_containing_methods = .data) + memorise_methods_used(c("clusterProfiler", "msigdbr", "msigdb"), + object_containing_methods = .data) } #' test_gene_overrepresentation -#' @inheritParams test_gene_overrepresentation #' #' @docType methods #' @rdname test_gene_overrepresentation-methods @@ -2201,7 +2226,6 @@ setMethod("test_gene_overrepresentation", .test_gene_overrepresentation_SE) #' test_gene_overrepresentation -#' @inheritParams test_gene_overrepresentation #' #' @docType methods #' @rdname test_gene_overrepresentation-methods @@ -2229,7 +2253,8 @@ setMethod("test_gene_overrepresentation", if (is_present(gene_set) & !is.null(gene_set)) { # Signal the deprecation to the user - deprecate_warn("1.3.1", "tidybulk::test_gene_rank(gene_set = )", details = "The argument gene_set is now deprecated please use gene_sets.") + 
deprecate_warn("1.3.1", "tidybulk::test_gene_rank(gene_set = )", + details = "The argument gene_set is now deprecated please use gene_sets.") gene_sets = gene_set } @@ -2255,7 +2280,9 @@ setMethod("test_gene_overrepresentation", # Check is correct species name if(species %in% msigdbr::msigdbr_species()$species_name %>% not()) - stop(sprintf("tidybulk says: wrong species name. MSigDB uses the latin species names (e.g., %s)", paste(msigdbr::msigdbr_species()$species_name, collapse=", "))) + stop(sprintf("tidybulk says: wrong species name. MSigDB uses the + latin species names (e.g., %s)", + paste(msigdbr::msigdbr_species()$species_name, collapse=", "))) .data %>% pivot_transcript() %>% @@ -2265,7 +2292,8 @@ setMethod("test_gene_overrepresentation", entrez_rank_to_gsea(species, gene_collections = gene_sets)%>% # Add methods used. It is here and not in functions because I need the original .data - memorise_methods_used(c("clusterProfiler", "enrichplot"), object_containing_methods = .data) %>% + memorise_methods_used(c("clusterProfiler", "enrichplot"), + object_containing_methods = .data) %>% when( gene_sets %>% is("character") ~ (.) %>% memorise_methods_used("msigdbr"), ~ (.) 
@@ -2275,7 +2303,6 @@ setMethod("test_gene_overrepresentation", } #' test_gene_rank -#' @inheritParams test_gene_rank #' #' @docType methods #' @rdname test_gene_rank-methods @@ -2286,7 +2313,6 @@ setMethod("test_gene_rank", .test_gene_rank_SE) #' test_gene_rank -#' @inheritParams test_gene_rank #' #' @docType methods #' @rdname test_gene_rank-methods @@ -2322,7 +2348,6 @@ setMethod("test_gene_rank", } #' pivot_sample -#' @inheritParams pivot_sample #' #' @docType methods #' @rdname pivot_sample-methods @@ -2333,7 +2358,6 @@ setMethod("pivot_sample", .pivot_sample) #' pivot_sample -#' @inheritParams pivot_sample #' #' @docType methods #' @rdname pivot_sample-methods @@ -2379,7 +2403,6 @@ setMethod("pivot_sample", } #' pivot_transcript -#' @inheritParams pivot_transcript #' #' @docType methods #' @rdname pivot_transcript-methods @@ -2390,7 +2413,6 @@ setMethod("pivot_transcript", .pivot_transcript) #' pivot_transcript -#' @inheritParams pivot_transcript #' #' @docType methods #' @rdname pivot_transcript-methods @@ -2401,14 +2423,13 @@ setMethod("pivot_transcript", .pivot_transcript) -.impute_missing_abundance_se = function(.data, +.impute_missing_abundance_se <- function(.data, .formula, .sample = NULL, .transcript = NULL, .abundance = NULL, suffix = "", force_scaling = FALSE) { - # Fix NOTEs . = NULL @@ -2435,7 +2456,13 @@ setMethod("pivot_transcript", library_size = colSums(.x, na.rm = TRUE) .x = .x / library_size } - else message(sprintf("tidybulk says: %s appears not to be scaled for sequencing depth (missing _scaled suffix; if you think this column is idependent of sequencing depth ignore this message), therefore the imputation can produce non meaningful results if sequencing depth for samples are highly variable. 
If you use force_scaling = TRUE library size will be used for eliminatig some sequencig depth effect before imputation", .y)) + else message(sprintf("tidybulk says: %s appears not to be scaled for + sequencing depth (missing _scaled suffix; if you think this column is + idependent of sequencing depth ignore this message), therefore the + imputation can produce non meaningful results if sequencing depth for + samples are highly variable. If you use force_scaling = TRUE library + size will be used for eliminatig some sequencig depth effect before + imputation", .y)) # Log need_log = max(.x, na.rm=TRUE) > 50 @@ -2484,7 +2511,6 @@ setMethod("pivot_transcript", #' impute_missing_abundance -#' @inheritParams impute_missing_abundance #' #' @docType methods #' @rdname impute_missing_abundance-methods @@ -2499,7 +2525,6 @@ setMethod("impute_missing_abundance", .impute_missing_abundance_se) #' impute_missing_abundance -#' @inheritParams impute_missing_abundance #' #' @docType methods #' @rdname impute_missing_abundance-methods @@ -2515,12 +2540,11 @@ setMethod("impute_missing_abundance", -.test_differential_cellularity_se = function(.data, +.test_differential_cellularity_se <- function(.data, .formula, method = "cibersort", reference = X_cibersort, - ...) -{ + ...) { # Fix NOTEs . = NULL @@ -2567,7 +2591,8 @@ setMethod("impute_missing_abundance", .formula %>% when( # If I have the dot, needed definitely for censored - format(.) %>% grepl("\\.", .) %>% any ~ format(.) %>% str_replace("([-\\+\\*~ ]?)(\\.)", "\\1.proportion_0_corrected"), + format(.) %>% grepl("\\.", .) %>% any ~ format(.) 
%>% + str_replace("([-\\+\\*~ ]?)(\\.)", "\\1.proportion_0_corrected"), # If normal formula ~ sprintf(".proportion_0_corrected%s", format(.)) @@ -2577,16 +2602,17 @@ setMethod("impute_missing_abundance", # Test univariable_differential_tissue_composition_SE(deconvoluted, - method, - .my_formula, - min_detected_proportion) %>% + method, + .my_formula, + min_detected_proportion) %>% # Attach attributes reattach_internals(.data) %>% # Add methods used when( - grepl("Surv", .my_formula) ~ (.) %>% memorise_methods_used(c("survival", "boot")), + grepl("Surv", .my_formula) ~ (.) %>% + memorise_methods_used(c("survival", "boot")), ~ (.) %>% memorise_methods_used("betareg") ) }, @@ -2622,15 +2648,16 @@ setMethod("impute_missing_abundance", # Test multivariable_differential_tissue_composition_SE(deconvoluted, - method, - .my_formula, - min_detected_proportion) %>% + method, + .my_formula, + min_detected_proportion) %>% # Attach attributes reattach_internals(.data) %>% # Add methods used - when(grepl("Surv", .my_formula) ~ (.) %>% memorise_methods_used(c("survival", "boot")), + when(grepl("Surv", .my_formula) ~ (.) %>% + memorise_methods_used(c("survival", "boot")), ~ (.)) }) %>% @@ -2642,7 +2669,6 @@ setMethod("impute_missing_abundance", } #' test_differential_cellularity -#' @inheritParams test_differential_cellularity #' #' @docType methods #' @rdname test_differential_cellularity-methods @@ -2656,7 +2682,6 @@ setMethod( ) #' test_differential_cellularity -#' @inheritParams test_differential_cellularity #' #' @docType methods #' @rdname test_differential_cellularity-methods @@ -2671,22 +2696,22 @@ setMethod( # Set internal #' @importFrom stringr str_replace -.test_stratification_cellularity_SE = function(.data, +.test_stratification_cellularity_SE <- function(.data, .formula, .sample = NULL, .transcript = NULL, .abundance = NULL, method = "cibersort", reference = X_cibersort, - ...) -{ + ...) { # Fix NOTEs . 
= NULL # Validate formula if(.formula %>% format() %>% grepl(" \\.|\\. ", .) %>% not) - stop("tidybulk says: in the formula a dot must be present in either these forms \". ~\" or \"~ .\" with a white-space after or before respectively") + stop("tidybulk says: in the formula a dot must be present in either these + forms \". ~\" or \"~ .\" with a white-space after or before respectively") deconvoluted = .data %>% @@ -2726,48 +2751,50 @@ setMethod( } #' test_stratification_cellularity -#' @inheritParams test_stratification_cellularity #' #' @docType methods #' @rdname test_stratification_cellularity-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("test_stratification_cellularity", "SummarizedExperiment", .test_stratification_cellularity_SE) #' test_stratification_cellularity -#' @inheritParams test_stratification_cellularity #' #' @docType methods #' @rdname test_stratification_cellularity-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). 
setMethod("test_stratification_cellularity", "RangedSummarizedExperiment", .test_stratification_cellularity_SE) - - #' get_bibliography -#' @inheritParams get_bibliography #' #' @docType methods #' @rdname get_bibliography-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). setMethod("get_bibliography", "SummarizedExperiment", .get_bibliography) #' get_bibliography -#' @inheritParams get_bibliography #' #' @docType methods #' @rdname get_bibliography-methods #' -#' @return A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +#' @return A consistent object (to the input) with additional columns for the +#' statistics from the hypothesis test (e.g., log fold change, p-value and +#' false discovery rate). 
setMethod("get_bibliography", "RangedSummarizedExperiment", .get_bibliography) @@ -2777,7 +2804,6 @@ setMethod("get_bibliography", #' @importFrom SummarizedExperiment rowData #' @importFrom tibble enframe #' -#' @inheritParams describe_transcript #' #' @docType methods #' @rdname describe_transcript-methods @@ -2785,7 +2811,7 @@ setMethod("get_bibliography", #' @return A `SummarizedExperiment` object #' #' -.describe_transcript_SE = function(.data, +.describe_transcript_SE <- function(.data, .transcript = NULL) { # Fix NOTEs @@ -2869,16 +2895,16 @@ setMethod("get_bibliography", } #' describe_transcript -#' @inheritParams describe_transcript #' #' @docType methods #' @rdname describe_transcript-methods #' -#' @return A consistent object (to the input) including additional columns for transcript symbol -setMethod("describe_transcript", "SummarizedExperiment", .describe_transcript_SE) +#' @return A consistent object (to the input) including additional +#' columns for transcript symbol +setMethod("describe_transcript", "SummarizedExperiment", + .describe_transcript_SE) #' describe_transcript -#' @inheritParams describe_transcript #' #' @docType methods #' @rdname describe_transcript-methods diff --git a/R/tidyr_methods.R b/R/tidyr_methods.R index 0065c08d..7a6dfa8b 100755 --- a/R/tidyr_methods.R +++ b/R/tidyr_methods.R @@ -1,62 +1,20 @@ -#' unnest -#' -#' @importFrom tidyr unnest -#' -#' @param data A tbl. (See tidyr) -#' @param cols <[`tidy-select`][tidyr_tidy_select]> Columns to unnest. -#' If you `unnest()` multiple columns, parallel entries must be of -#' compatibble sizes, i.e. they're either equal or length 1 (following the -#' standard tidyverse recycling rules). -#' @param ... <[`tidy-select`][tidyr_tidy_select]> Columns to nest, specified -#' using name-variable pairs of the form `new_col=c(col1, col2, col3)`. -#' The right hand side can be any valid tidy select expression. 
-#' -#' \Sexpr[results=rd, stage=render]{lifecycle::badge("deprecated")}: -#' previously you could write `df %>% nest(x, y, z)` and `df %>% -#' unnest(x, y, z)`. Convert to `df %>% nest(data=c(x, y, z))`. -#' and `df %>% unnest(c(x, y, z))`. -#' -#' If you previously created new variable in `unnest()` you'll now need to -#' do it explicitly with `mutate()`. Convert `df %>% unnest(y=fun(x, y, z))` -#' to `df %>% mutate(y=fun(x, y, z)) %>% unnest(y)`. -#' @param names_sep If `NULL`, the default, the names will be left -#' as is. In `nest()`, inner names will come from the former outer names; -#' in `unnest()`, the new outer names will come from the inner names. -#' -#' If a string, the inner and outer names will be used together. In `nest()`, -#' the names of the new outer columns will be formed by pasting together the -#' outer and the inner column names, separated by `names_sep`. In `unnest()`, -#' the new inner names will have the outer names (+ `names_sep`) automatically -#' stripped. This makes `names_sep` roughly symmetric between nesting and unnesting. 
-#' @param keep_empty See tidyr::unnest -#' @param names_repair See tidyr::unnest -#' @param ptype See tidyr::unnest -#' @param .drop See tidyr::unnest -#' @param .id tidyr::unnest -#' @param .sep tidyr::unnest -#' @param .preserve See tidyr::unnest -#' -#' -#' @return A tidySummarizedExperiment objector a tibble depending on input +#' @name unnest +#' @rdname unnest +#' @inherit tidyr::unnest +#' @return `tidySingleCellExperiment` #' #' @examples +#' data(se_mini) +#' se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) #' -#' -#' tidybulk::se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) -#' -#' @rdname nest-methods -#' @name unnest -#' -#' @export -NULL - +#' @importFrom tidyr unnest +#' @importFrom rlang enquo #' @export -unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, ptype=NULL, names_sep=NULL, names_repair="check_unique", .drop, .id, .sep, .preserve) -{ +unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, + ptype=NULL, names_sep=NULL, names_repair="check_unique", + .drop, .id, .sep, .preserve) { cols <- enquo(cols) - - data %>% drop_class(c("nested_tidybulk", "tt")) %>% tidyr::unnest(!!cols, ..., keep_empty = keep_empty, ptype = ptype, @@ -68,41 +26,24 @@ unnest.nested_tidybulk <- function (data, cols, ..., keep_empty=FALSE, ptype=NUL # Add class add_class("tt") %>% add_class("tidybulk") - } -#' nest -#' -#' @importFrom tidyr nest -#' -#' @param .data A tbl. (See tidyr) -#' @param ... 
Name-variable pairs of the form new_col = c(col1, col2, col3) (See tidyr) -#' -#' @return A tt object -#' -#' @examples -#' -#' tidybulk::se_mini %>% tidybulk() %>% nest( data = -.feature) -#' -#' @rdname nest-methods #' @name nest +#' @rdname nest +#' @inherit tidyr::nest #' -#' @export -NULL - -#' @importFrom rlang enquos +#' @examples +#' data(se_mini) +#' se_mini %>% tidybulk() %>% nest(data = -.feature) #' +#' @importFrom tidyr nest +#' @importFrom rlang enquo #' @export -#' -#' -#' -nest.tidybulk <- function (.data, ..., .names_sep = NULL) -{ +nest.tidybulk <- function (.data, ..., .names_sep = NULL) { cols <- enquos(...) col_name_data = names(cols) .data %>% - # This is needed otherwise nest goes into loop and fails drop_class(c("tidybulk", "tt")) %>% tidyr::nest(...) %>% @@ -121,5 +62,4 @@ nest.tidybulk <- function (.data, ..., .names_sep = NULL) # Add class add_class("tt") %>% add_class("nested_tidybulk") - } diff --git a/R/utilities.R b/R/utilities.R index ad09fc19..467f6232 100755 --- a/R/utilities.R +++ b/R/utilities.R @@ -1110,7 +1110,12 @@ do_validate = function(){ #' #' @importFrom stringr str_remove #' @importFrom stringr str_replace_all -#' +#' @importFrom betareg betareg +#' @importFrom broom tidy +#' @importFrom boot logit +#' @importFrom survival coxph +#' @importFrom survminer surv_fit +#' @importFrom survminer ggsurvplot multivariable_differential_tissue_composition = function( deconvoluted, method, @@ -1133,16 +1138,6 @@ multivariable_differential_tissue_composition = function( # Beta or Cox when( grepl("Surv", .my_formula) %>% any ~ { - # Check if package is installed, otherwise install - if (find.package("survival", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing betareg needed for analyses") - install.packages("survival", repos = "https://cloud.r-project.org") - } - - if (find.package("boot", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing boot needed for analyses") - install.packages("boot", repos = 
"https://cloud.r-project.org") - } (.) %>% survival::coxph(.my_formula, .) %>% @@ -1201,16 +1196,6 @@ univariable_differential_tissue_composition = function( .x %>% when( grepl("Surv", .my_formula) %>% any ~ { - # Check if package is installed, otherwise install - if (find.package("survival", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing betareg needed for analyses") - install.packages("survival", repos = "https://cloud.r-project.org") - } - - if (find.package("boot", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing boot needed for analyses") - install.packages("boot", repos = "https://cloud.r-project.org") - } (.) %>% mutate(.proportion_0_corrected = .proportion_0_corrected %>% boot::logit()) %>% @@ -1219,11 +1204,7 @@ univariable_differential_tissue_composition = function( select(-term) } , ~ { - # Check if package is installed, otherwise install - if (find.package("betareg", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing betareg needed for analyses") - install.packages("betareg", repos = "https://cloud.r-project.org") - } + (.) %>% betareg::betareg(.my_formula, .) 
%>% broom::tidy() %>% @@ -1245,24 +1226,6 @@ univariable_differential_tissue_stratification = function( .my_formula ){ - # Check if package is installed, otherwise install - if (find.package("survival", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing survival needed for analyses") - install.packages("survival", repos = "https://cloud.r-project.org") - } - - # Check if package is installed, otherwise install - if (find.package("survminer", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing survminer needed for analyses") - install.packages("survminer", repos = "https://cloud.r-project.org") - } - - - if (find.package("broom", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing broom needed for analyses") - install.packages("broom", repos = "https://cloud.r-project.org") - } - deconvoluted %>% # Test @@ -1317,30 +1280,17 @@ univariable_differential_tissue_stratification = function( unnest(surv_test, keep_empty = TRUE) } +#' @importFrom tidyr nest +#' @importFrom survival survdiff +#' @importFrom survminer surv_fit +#' @importFrom survminer ggsurvplot +#' @importFrom broom tidy univariable_differential_tissue_stratification_SE = function( deconvoluted, method, .my_formula ){ - # Check if package is installed, otherwise install - if (find.package("survival", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing survival needed for analyses") - install.packages("survival", repos = "https://cloud.r-project.org") - } - - # Check if package is installed, otherwise install - if (find.package("survminer", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing survminer needed for analyses") - install.packages("survminer", repos = "https://cloud.r-project.org") - } - - - if (find.package("broom", quiet = TRUE) %>% length %>% equals(0)) { - message("Installing broom needed for analyses") - install.packages("broom", repos = "https://cloud.r-project.org") - } - deconvoluted %>% pivot_sample() %>% diff --git 
a/R/validation.R b/R/validation.R index 462d181c..5530d394 100755 --- a/R/validation.R +++ b/R/validation.R @@ -16,7 +16,7 @@ #' #' @return A tbl #' -check_if_wrong_input = function(.data, list_input, expected_type) { +check_if_wrong_input <- function(.data, list_input, expected_type) { # Do the check if (list_input %>% map( ~ .x %>% class() %>% `[` (1)) %>% @@ -119,11 +119,12 @@ check_if_attribute_present = function(.data) { "tt_columns" %in% (.data %>% attr("internals") %>% names) } -eliminate_sparse_transcripts = function(.data, .transcript){ +eliminate_sparse_transcripts <- function(.data, .transcript){ # Parse column names .transcript = enquo(.transcript) - - warning("tidybulk says: Some transcripts have been omitted from the analysis because not present in every sample.") + + warning("tidybulk says: Some transcripts have been omitted from the analysis ", + "because not present in every sample.") .data %>% add_count(!!.transcript, name = "my_n") %>% @@ -132,7 +133,6 @@ eliminate_sparse_transcripts = function(.data, .transcript){ } check_if_data_rectangular = function(.data, .sample, .transcript, .abundance){ - # Parse column names .sample = enquo(.sample) .transcript = enquo(.transcript) @@ -150,15 +150,15 @@ check_if_data_rectangular = function(.data, .sample, .transcript, .abundance){ } -warning_if_data_is_not_rectangular = function(.data, .sample, .transcript, .abundance){ - +warning_if_data_is_not_rectangular <- function(.data, .sample, .transcript, .abundance) { # Parse column names .sample = enquo(.sample) .transcript = enquo(.transcript) .abundance = enquo(.abundance) if(!check_if_data_rectangular(.data, !!.sample, !!.transcript, !!.abundance)) - warning("tidybulk says: the data does not have the same number of transcript per sample. The data set is not rectangular.") + warning("tidybulk says: the data does not have the same number of transcript ", + "per sample. 
The data set is not rectangular.") } @@ -170,14 +170,17 @@ error_if_data_is_not_rectangular = function(.data, .sample, .transcript, .abunda .abundance = enquo(.abundance) if(!check_if_data_rectangular(.data, !!.sample, !!.transcript, !!.abundance)) - stop("tidybulk says: the data must have the same number of transcript per sample. Check again that you have not filtered single observations accidentally. If you have missing data you can use fill_missing_abundance() or impute_missing_abundance()") + stop("tidybulk says: the data must have the same number of transcript per sample. ", + "Check again that you have not filtered single observations accidentally. ", + "If you have missing data you can use fill_missing_abundance() or impute_missing_abundance()") } tidybulk_to_tbl = function(.data) { .data %>% drop_class(c("tidybulk", "tt")) } -validation_default = function(.data, +#'@export +validation_default <- function(.data, .sample, .transcript, .abundance, @@ -193,11 +196,14 @@ validation_default = function(.data, if (type == "hard" & !is_missing) stop( - "tidybulk says: One or more columns that should include sample identifier, transcript identified or transcript abundance are missing from your data frame." + "tidybulk says: One or more columns that should include sample identifier, ", + "transcript identified or transcript abundance are missing from your data frame." ) if (type == "soft" & !is_missing) { warning( - "tidybulk says: One or more columns that should include sample identifier, transcript identified or transcript abundance are missing from your data frame. The tidybulk object has been converted to a `tbl`" + "tidybulk says: One or more columns that should include sample identifier, ", + "transcript identified or transcript abundance are missing from your data frame. 
", + "The tidybulk object has been converted to a `tbl`" ) return(.data %>% tidybulk_to_tbl) } @@ -207,11 +213,13 @@ validation_default = function(.data, if (type == "hard" & !is_type) stop( - "tidybulk says: The column provided as .sample .transcript or .abundance do not comply with the required types (, , )." + "tidybulk says: The column provided as .sample .transcript or .abundance do not ", + "comply with the required types (, , )." ) if (type == "soft" & !is_type) { warning( - "tidybulk says: The column provided as .sample .transcript or .abundance do not comply with the required types. The tidybulk object has been converted to a `tbl`" + "tidybulk says: The column provided as .sample .transcript or .abundance do not ", + "comply with the required types. The tidybulk object has been converted to a `tbl`" ) return(.data %>% tidybulk_to_tbl) } @@ -262,9 +270,10 @@ validation <- function(.data, UseMethod("validation", .data) } -validation.default = validation_default +validation.default <- validation_default -validation.tidybulk = function(.data, +#'@export +validation.tidybulk <- function(.data, .sample = NULL, .transcript = NULL, .abundance = NULL, @@ -275,11 +284,16 @@ validation.tidybulk = function(.data, if (type == "hard" & !is_attr) stop( - "tidybulk says: The object provided has tidybulk class but no attribute containing the column names (attr(., \"internals\")). You must have used an external function that eliminated the attributes. Insert a valid tidybulk object or provide `.sample`, `.transcript`, `.abundance` column names as arguments " + "tidybulk says: The object provided has tidybulk class but no attribute ", + "containing the column names (attr(., \"internals\")). You must have used ", + "an external function that eliminated the attributes. 
Insert a valid ", + "tidybulk object or provide `.sample`, `.transcript`, `.abundance` column names as arguments " ) if (type == "soft" & !is_attr) { warning( - "tidybulk says: The object provided has tidybulk class but no attribute containing the column names (attr(., \"internals\")). You must have used an external function that eliminated the attributes. The tidybulk object has been converted to a `tbl`" + "tidybulk says: The object provided has tidybulk class but no attribute ", + "containing the column names (attr(., \"internals\")). You must have used an ", + "external function that eliminated the attributes. The tidybulk object has been converted to a `tbl`" ) return(.data %>% tidybulk_to_tbl) } @@ -312,12 +326,14 @@ validate_signature = function(.data, reference, .transcript){ if(length(overlapping_genes) == 0 ) stop(sprintf( - "\ntidybulk says: You have NO genes in common between the query data and the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", paste(rownames(reference)[1:10], collapse = ", ") + "\ntidybulk says: You have NO genes in common between the query data and the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", + paste(rownames(reference)[1:10], collapse = ", ") )) if ( length(overlapping_genes) %>% st(50) ) warning(sprintf( - "\ntidybulk says: You have less than 50 genes in common between the query data and the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", paste(rownames(reference)[1:10], collapse = ", ") + "\ntidybulk says: You have less than 50 genes in common between the query data and the reference data. 
Please check again your input dataframes\nthe genes in the reference look like this %s", + paste(rownames(reference)[1:10], collapse = ", ") )) # Check if rownames exist @@ -327,18 +343,19 @@ validate_signature = function(.data, reference, .transcript){ } -validate_signature_SE = function(assay, reference){ - +validate_signature_SE <- function(assay, reference) { overlapping_genes = (rownames(assay) %in% rownames(reference)) %>% which if(length(overlapping_genes) == 0 ) stop(sprintf( - "\ntidybulk says: You have NO genes in common between the query data and the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", paste(rownames(reference)[1:10], collapse = ", ") + "\ntidybulk says: You have NO genes in common between the query data and ", + "the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", paste(rownames(reference)[1:10], collapse = ", ") )) if ( length(overlapping_genes) %>% st(50) ) warning(sprintf( - "\ntidybulk says: You have less than 50 genes in common between the query data and the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", paste(rownames(reference)[1:10], collapse = ", ") + "\ntidybulk says: You have less than 50 genes in common between the query data and the reference data. Please check again your input dataframes\nthe genes in the reference look like this %s", + paste(rownames(reference)[1:10], collapse = ", ") )) # Check if rownames exist diff --git a/R/zzz.R b/R/zzz.R index cfa9fcb7..170b9430 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -3,24 +3,25 @@ version = packageDescription(pkgname, fields = "Version") msg = paste0("======================================== -", pkgname, " version ", version, " -If you use TIDYBULK in published research, please cite: - -Mangiola et al. tidybulk: an R tidy framework for modular -transcriptomic data analysis. Genome Biology 2021. 
- -This message can be suppressed by: - suppressPackageStartupMessages(library(tidybulk)) -======================================== -") + ", pkgname, " version ", version, " + If you use TIDYBULK in published research, please cite: + + Mangiola et al. tidybulk: an R tidy framework for modular + transcriptomic data analysis. Genome Biology 2021. + + This message can be suppressed by: + suppressPackageStartupMessages(library(tidybulk)) + ======================================== + ") - packageStartupMessage(msg) + # Attach tidyverse + attached <- tidyverse_attach() } -rv = R.Version() - -if(getRversion() >= "4.0.0" && as.numeric(rv$`svn rev`) >= 77889) { - unitType = get("unitType", envir = asNamespace("grid")) -} else { - unitType = function(x, recurse = TRUE) attr(x, "unit") -} \ No newline at end of file +# rv = R.Version() +# +# if(getRversion() >= "4.0.0" && as.numeric(rv$`svn rev`) >= 77889) { +# unitType = get("unitType", envir = asNamespace("grid")) +# } else { +# unitType = function(x, recurse = TRUE) attr(x, "unit") +# } \ No newline at end of file diff --git a/README.Rmd b/README.Rmd index 7a0aae20..6d01d506 100755 --- a/README.Rmd +++ b/README.Rmd @@ -1,10 +1,11 @@ --- -title: "tidybulk - part of tidyTranscriptomics" +title: "tidybulk - part of _tidyomics_" output: github_document --- -[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) [![R build status](https://github.com/stemangiola/tidybulk/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/stemangiola/tidybulk/actions) +[![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) [![R build status](https://github.com/stemangiola/tidybulk/workflows/rworkflows/badge.svg)](https://github.com/stemangiola/tidybulk/actions) + ```{r echo=FALSE} @@ -19,50 +20,61 @@ The code is released under the version 3 of the GNU General Public License. 
knitr::include_graphics("man/figures/logo.png") ``` -website: [stemangiola.github.io/tidybulk/](http://stemangiola.github.io/tidybulk/) -[Third party tutorials](https://rstudio-pubs-static.s3.amazonaws.com/792462_f948e766b15d4ee5be5c860493bda0b3.html) -Please have a look also to - -- [tidySummarizedExperiment](https://github.com/stemangiola/tidySummarizedExperiment) for bulk data tidy representation -- [tidySingleCellExperiment](https://github.com/stemangiola/tidySingleCellExperiment) for single-cell data tidy representation -- [tidyseurat](https://github.com/stemangiola/tidyseurat) for single-cell data tidy representation -- [tidyHeatmap](https://github.com/stemangiola/tidyHeatmap) for heatmaps produced with tidy principles -analysis and manipulation -- [tidygate](https://github.com/stemangiola/tidygate) for adding custom -gate information to your tibble +Resources to help you get started with _tidyomics_ and tidybulk: + +* [The tidyomics blog](https://tidyomics.github.io/tidyomicsBlog/) +* [The tidybulk website](http://stemangiola.github.io/tidybulk/) +* [Third party tutorials](https://rstudio-pubs-static.s3.amazonaws.com/792462_f948e766b15d4ee5be5c860493bda0b3.html) + +The _tidyomics_ ecosystem includes packages for: + +* Working with genomic features: + * [plyranges](https://github.com/sa-lee/plyranges), for tidy manipulation of genomic range data. + * [nullranges](https://github.com/nullranges/nullranges), for tidy generation of genomic ranges representing the null hypothesis. + * [plyinteractions](https://github.com/tidyomics/plyinteractions), for tidy manipulation of genomic interaction data. + +* Working with transcriptomic features: - +A few more tidy tools for data manipulation and plotting: + +* [tidyHeatmap](https://github.com/stemangiola/tidyHeatmap), for producing heatmaps with tidy principles. +analysis and manipulation +* [tidygate](https://github.com/stemangiola/tidygate), for interactive plotting and gating. 
```{r, echo=FALSE, out.width = "800px"} knitr::include_graphics("man/figures/new_SE_usage-01.png") ``` + ## Functions/utilities available Function | Description ------------ | ------------- +`identify_abundant` | identify the abundant genes `aggregate_duplicates` | Aggregate abundance and annotation of duplicated transcripts in a robust way -`identify_abundant` `keep_abundant` | identify or keep the abundant genes -`keep_variable` | Filter for top variable features `scale_abundance` | Scale (normalise) abundance for RNA sequencing depth -`reduce_dimensions` | Perform dimensionality reduction (PCA, MDS, tSNE, UMAP) +`reduce_dimensions` | Perform dimensionality reduction (PCA, MDS, tSNE) `cluster_elements` | Labels elements with cluster identity (kmeans, SNN) `remove_redundancy` | Filter out elements with highly correlated features `adjust_abundance` | Remove known unwanted variation (Combat) -`test_differential_abundance` | Differential transcript abundance testing (DESeq2, edgeR, voom) -`deconvolve_cellularity` | Estimated tissue composition (Cibersort, llsr, epic, xCell, mcp_counter, quantiseq +`test_differential_abundance` | Differential transcript abundance testing (DE) +`deconvolve_cellularity` | Estimated tissue composition (Cibersort or llsr) `test_differential_cellularity` | Differential cell-type abundance testing -`test_stratification_cellularity` | Estimate Kaplan-Meier survival differences +`keep_variable` | Filter for top variable features +`keep_abundant` | Filter out lowly abundant transcripts `test_gene_enrichment` | Gene enrichment analyses (EGSEA) `test_gene_overrepresentation` | Gene enrichment on list of transcript names (no rank) -`test_gene_rank` | Gene enrichment on list of transcript (GSEA) -`impute_missing_abundance` | Impute abundance for missing data points using sample groupings Utilities | Description @@ -76,11 +88,17 @@ Utilities | Description `ensembl_to_symbol` | Add gene symbol from ensembl IDs `symbol_to_entrez` | Add entrez ID from 
gene symbol `describe_transcript` | Add gene description from gene symbol +`impute_missing_abundance` | Impute abundance for missing data points using sample groupings +`fill_missing_abundance` | Fill abundance for missing data points using an arbitrary value All functions are directly compatibble with `SummarizedExperiment` object. ```{r, echo=FALSE, include=FALSE, } +library(knitr) +# knitr::opts_chunk$set(cache = TRUE, warning = FALSE, +# message = FALSE, cache.lazy = FALSE) + library(dplyr) library(tidyr) library(tibble) @@ -89,7 +107,6 @@ library(ggplot2) library(ggrepel) library(tidybulk) library(tidySummarizedExperiment) -library(here) my_theme = theme_bw() + @@ -105,11 +122,9 @@ my_theme = axis.title.x = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)), axis.title.y = element_text(margin = margin(t = 10, r = 10, b = 10, l = 10)) ) - utils::download.file("https://zenodo.org/records/11201167/files/counts_SE.rda?download=1", destfile = "counts_SE.rda") here("counts_SE.rda") |> load() tibble_counts = counts_SE |> as_tibble() - ``` ## Installation @@ -129,13 +144,13 @@ devtools::install_github("stemangiola/tidybulk") We will use a `SummarizedExperiment` object ```{r} -counts_SE +se_mini ``` Loading `tidySummarizedExperiment` will automatically abstract this object as `tibble`, so we can display it and manipulate it with tidy tools. Although it looks different, and more tools (tidyverse) are available to us, this object is in fact a `SummarizedExperiment` object. 
```{r} -class(counts_SE) +class(se_mini) ``` ## Get the bibliography of your workflow @@ -275,7 +290,7 @@ count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% - cmdscale.out |> + cmdscale.out %>% setNames(sprintf("Dim%s", 1:6)) cmds$cell_type = tibble_counts[ @@ -312,7 +327,7 @@ counts_SE.norm.PCA = Standard procedure (comparative purpose) ```{r,eval=FALSE} count_m_log = log(count_m + 1) -pc = count_m_log |> prcomp(scale = TRUE) +pc = count_m_log %>% prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ @@ -325,7 +340,7 @@ pc$cell_type = counts[ On the x and y axes axis we have the reduced dimensions 1 to 3, data is coloured by cell type. -```{r plot_pca, cache=TRUE} +```{r plot_pca, eval=FALSE} counts_SE.norm.PCA |> pivot_sample() |> select(contains("PC"), everything()) @@ -399,9 +414,9 @@ rotation = function(m, d) { ((bind_rows( c(`1` = cos(r), `2` = -sin(r)), c(`1` = sin(r), `2` = cos(r)) - ) |> as_matrix()) %*% m) + ) %>% as_matrix) %*% m) } -mds_r = pca |> rotation(rotation_degrees) +mds_r = pca %>% rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), "Cell.type" @@ -440,7 +455,7 @@ TidyTranscriptomics counts_SE.de = counts_SE |> test_differential_abundance( ~ condition, action="get") -counts_SE.de +se_mini.de ```
@@ -475,7 +490,7 @@ counts_SE.de = ## Adjust `counts` -We may want to adjust `counts` for (known) unwanted variation. `adjust_abundance` takes as arguments a tibble, column names (as symbols; for `sample`, `transcript` and `count`) and a formula representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation, and returns a tibble with additional columns for the adjusted counts as `_adjusted`. At the moment just an unwanted covariated is allowed at a time. +We may want to adjust `counts` for (known) unwanted variation. `adjust_abundance` takes as arguments a tibble, column names (as symbols; for `sample`, `transcript` and `count`) and a formula representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation, and returns a tibble with additional columns for the adjusted counts as `_adjusted`. At the moment just one unwanted covariate is allowed at a time.
TidyTranscriptomics @@ -534,7 +549,7 @@ Standard procedure (comparative purpose) ```{r, eval=FALSE} source(‘CIBERSORT.R’) -count_m |> write.table("mixture_file.txt") +count_m %>% write.table("mixture_file.txt") results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", @@ -570,7 +585,7 @@ counts_SE.cibersort |> We can also perform a statistical test on the differential cell-type abundance across conditions -```{r DC, cache=TRUE} +```{r DC, eval=FALSE} counts_SE |> test_differential_cellularity(. ~ condition ) @@ -579,8 +594,7 @@ We can also perform a statistical test on the differential cell-type abundance a We can also perform regression analysis with censored data (coxph). -```{r DC_censored} - # Add survival data +```{r DC_censored, eval=FALSE} counts_SE_survival = counts_SE |> @@ -663,7 +677,6 @@ We can add cluster annotation to the MDS dimension reduced data set and plot. Matrix package (v1.3-3) causes an error with Seurat::FindNeighbors used in this method. We are trying to solve this issue. At the moment this option in unaviable. -
TidyTranscriptomics ```{r SNN, eval=FALSE, cache=TRUE, message=FALSE, warning=FALSE, results='hide'} @@ -750,14 +763,14 @@ library(widyr) sort = TRUE, diag = FALSE, upper = FALSE - ) |> - filter(correlation > correlation_threshold) |> - distinct(item1) |> + ) %>% + filter(correlation > correlation_threshold) %>% + distinct(item1) %>% rename(!!.element := item1) # Return non redudant data frame -counts |> anti_join(.data.correlated) |> - spread(sample, rc, - transcript) |> +counts %>% anti_join(.data.correlated) %>% + spread(sample, rc, - transcript) %>% left_join(annotation) @@ -835,5 +848,4 @@ We can add gene full name (and in future description) from symbol identifiers. T counts_SE |> describe_transcript() |> select(feature, description, everything()) -``` - +``` \ No newline at end of file diff --git a/README.md b/README.md index 74c4a5bb..cd693e41 100755 --- a/README.md +++ b/README.md @@ -1,11 +1,12 @@ -tidybulk - part of tidyTranscriptomics +tidybulk - part of *tidyomics* ================ [![Lifecycle:maturing](https://img.shields.io/badge/lifecycle-maturing-blue.svg)](https://www.tidyverse.org/lifecycle/#maturing) [![R build -status](https://github.com/stemangiola/tidybulk/workflows/R-CMD-check-bioc/badge.svg)](https://github.com/stemangiola/tidybulk/actions) +status](https://github.com/stemangiola/tidybulk/workflows/rworkflows/badge.svg)](https://github.com/stemangiola/tidybulk/actions) + **Brings transcriptomics to the tidyverse!** @@ -36,40 +37,44 @@ Please have a look also to [![Build Status](https://travis-ci.org/stemangiola/tidybulk.svg?branch=master)](https://travis-ci.org/stemangiola/tidybulk) [![Coverage Status](https://coveralls.io/repos/github/stemangiola/tidybulk/badge.svg?branch=master)](https://coveralls.io/github/stemangiola/tidybulk?branch=master) --> +[![Build Status](https://travis-ci.org/stemangiola/tidybulk.svg?branch=master)](https://travis-ci.org/stemangiola/tidybulk) [![Coverage 
Status](https://coveralls.io/repos/github/stemangiola/tidybulk/badge.svg?branch=master)](https://coveralls.io/github/stemangiola/tidybulk?branch=master) + +--> + ## Functions/utilities available -| Function | Description | -|-------------------------------------|------------------------------------------------------------------------------------| -| `aggregate_duplicates` | Aggregate abundance and annotation of duplicated transcripts in a robust way | -| `identify_abundant` `keep_abundant` | identify or keep the abundant genes | -| `keep_variable` | Filter for top variable features | -| `scale_abundance` | Scale (normalise) abundance for RNA sequencing depth | -| `reduce_dimensions` | Perform dimensionality reduction (PCA, MDS, tSNE, UMAP) | -| `cluster_elements` | Labels elements with cluster identity (kmeans, SNN) | -| `remove_redundancy` | Filter out elements with highly correlated features | -| `adjust_abundance` | Remove known unwanted variation (Combat) | -| `test_differential_abundance` | Differential transcript abundance testing (DESeq2, edgeR, voom) | -| `deconvolve_cellularity` | Estimated tissue composition (Cibersort, llsr, epic, xCell, mcp_counter, quantiseq | -| `test_differential_cellularity` | Differential cell-type abundance testing | -| `test_stratification_cellularity` | Estimate Kaplan-Meier survival differences | -| `test_gene_enrichment` | Gene enrichment analyses (EGSEA) | -| `test_gene_overrepresentation` | Gene enrichment on list of transcript names (no rank) | -| `test_gene_rank` | Gene enrichment on list of transcript (GSEA) | -| `impute_missing_abundance` | Impute abundance for missing data points using sample groupings | - -| Utilities | Description | -|-----------------------|--------------------------------------------| -| `get_bibliography` | Get the bibliography of your workflow | -| `tidybulk` | add tidybulk attributes to a tibble object | -| `tidybulk_SAM_BAM` | Convert SAM BAM files into tidybulk tibble | -| `pivot_sample` | 
Select sample-wise columns/information | -| `pivot_transcript` | Select transcript-wise columns/information | -| `rotate_dimensions` | Rotate two dimensions of a degree | -| `ensembl_to_symbol` | Add gene symbol from ensembl IDs | -| `symbol_to_entrez` | Add entrez ID from gene symbol | -| `describe_transcript` | Add gene description from gene symbol | +| Function | Description | +|---------------------------------|------------------------------------------------------------------------------| +| `identify_abundant` | identify the abundant genes | +| `aggregate_duplicates` | Aggregate abundance and annotation of duplicated transcripts in a robust way | +| `scale_abundance` | Scale (normalise) abundance for RNA sequencing depth | +| `reduce_dimensions` | Perform dimensionality reduction (PCA, MDS, tSNE) | +| `cluster_elements` | Labels elements with cluster identity (kmeans, SNN) | +| `remove_redundancy` | Filter out elements with highly correlated features | +| `adjust_abundance` | Remove known unwanted variation (Combat) | +| `test_differential_abundance` | Differential transcript abundance testing (DE) | +| `deconvolve_cellularity` | Estimated tissue composition (Cibersort or llsr) | +| `test_differential_cellularity` | Differential cell-type abundance testing | +| `keep_variable` | Filter for top variable features | +| `keep_abundant` | Filter out lowly abundant transcripts | +| `test_gene_enrichment` | Gene enrichment analyses (EGSEA) | +| `test_gene_overrepresentation` | Gene enrichment on list of transcript names (no rank) | + +| Utilities | Description | +|----------------------------|-----------------------------------------------------------------| +| `get_bibliography` | Get the bibliography of your workflow | +| `tidybulk` | add tidybulk attributes to a tibble object | +| `tidybulk_SAM_BAM` | Convert SAM BAM files into tidybulk tibble | +| `pivot_sample` | Select sample-wise columns/information | +| `pivot_transcript` | Select transcript-wise 
columns/information | +| `rotate_dimensions` | Rotate two dimensions of a degree | +| `ensembl_to_symbol` | Add gene symbol from ensembl IDs | +| `symbol_to_entrez` | Add entrez ID from gene symbol | +| `describe_transcript` | Add gene description from gene symbol | +| `impute_missing_abundance` | Impute abundance for missing data points using sample groupings | +| `fill_missing_abundance` | Fill abundance for missing data points using an arbitrary value | All functions are directly compatibble with `SummarizedExperiment` object. @@ -93,7 +98,7 @@ devtools::install_github("stemangiola/tidybulk") We will use a `SummarizedExperiment` object ``` r -counts_SE +se_mini ``` ## # A SummarizedExperiment-tibble abstraction: 408,624 × 8 @@ -118,7 +123,7 @@ tools. Although it looks different, and more tools (tidyverse) are available to us, this object is in fact a `SummarizedExperiment` object. ``` r -class(counts_SE) +class(se_mini) ``` ## [1] "SummarizedExperiment" @@ -319,6 +324,7 @@ count_m_log = log(count_m + 1) cmds = limma::plotMDS(ndim = .dims, plot = FALSE) cmds = cmds %$% + cmdscale.out |> cmdscale.out |> setNames(sprintf("Dim%s", 1:6)) @@ -397,6 +403,7 @@ Standard procedure (comparative purpose) ``` r count_m_log = log(count_m + 1) pc = count_m_log |> prcomp(scale = TRUE) +pc = count_m_log |> prcomp(scale = TRUE) variance = pc$sdev^2 variance = (variance / sum(variance))[1:6] pc$cell_type = counts[ @@ -441,8 +448,6 @@ counts_SE.norm.PCA |> GGally::ggpairs(columns = 11:13, ggplot2::aes(colour=`Cell.type`)) ``` -![](man/figures/plot_pca-1.png) - **tSNE**
@@ -550,8 +555,10 @@ rotation = function(m, d) { c(`1` = cos(r), `2` = -sin(r)), c(`1` = sin(r), `2` = cos(r)) ) |> as_matrix()) %*% m) + ) |> as_matrix()) %*% m) } mds_r = pca |> rotation(rotation_degrees) +mds_r = pca |> rotation(rotation_degrees) mds_r$cell_type = counts[ match(counts$sample, rownames(mds_r)), "Cell.type" @@ -607,7 +614,7 @@ TidyTranscriptomics counts_SE.de = counts_SE |> test_differential_abundance( ~ condition, action="get") -counts_SE.de +se_mini.de ```
@@ -659,7 +666,7 @@ symbols; for `sample`, `transcript` and `count`) and a formula representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation, and returns a tibble with additional columns for the adjusted counts as -`_adjusted`. At the moment just an unwanted covariated is +`_adjusted`. At the moment just one unwanted covariate is allowed at a time.
@@ -735,6 +742,7 @@ Standard procedure (comparative purpose) ``` r source(‘CIBERSORT.R’) count_m |> write.table("mixture_file.txt") +count_m |> write.table("mixture_file.txt") results <- CIBERSORT( "sig_matrix_file.txt", "mixture_file.txt", diff --git a/_pkgdown.yml b/_pkgdown.yml deleted file mode 100644 index 6ef5f5a6..00000000 --- a/_pkgdown.yml +++ /dev/null @@ -1 +0,0 @@ -destination: docs diff --git a/codecov.yml b/codecov.yml deleted file mode 100755 index 8f36b6cc..00000000 --- a/codecov.yml +++ /dev/null @@ -1,12 +0,0 @@ -comment: false - -coverage: - status: - project: - default: - target: auto - threshold: 1% - patch: - default: - target: auto - threshold: 1% diff --git a/dev/dplyr-master-methods.R b/dev/dplyr-master-methods.R old mode 100755 new mode 100644 diff --git a/man/adjust_abundance-methods.Rd b/man/adjust_abundance-methods.Rd index 8ee70a84..11587724 100644 --- a/man/adjust_abundance-methods.Rd +++ b/man/adjust_abundance-methods.Rd @@ -107,13 +107,21 @@ adjust_abundance( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{DEPRECATED - A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch)} +\item{.formula}{DEPRECATED - A formula with no response variable, +representing the desired linear model where the first covariate is the +factor of interest and the second covariate is the unwanted variation +(of the kind ~ factor_of_interest + batch)} -\item{.factor_unwanted}{A tidy select, e.g. 
column names without double quotation. c(batch, country) These are the factor that we want to adjust for, including unwanted batcheffect, and unwanted biological effects.} +\item{.factor_unwanted}{A tidy select, e.g. column names without double +quotation. c(batch, country) These are the factor that we want to adjust for, +including unwanted batcheffect, and unwanted biological effects.} -\item{.factor_of_interest}{A tidy select, e.g. column names without double quotation. c(treatment) These are the factor that we want to preserve.} +\item{.factor_of_interest}{A tidy select, e.g. column names without double +quotation. c(treatment) These are the factor that we want to preserve.} \item{.sample}{The name of the sample column} @@ -121,54 +129,70 @@ adjust_abundance( \item{.abundance}{The name of the transcript/gene abundance column} -\item{method}{A character string. Methods include combat_seq (default), combat and limma_remove_batch_effect.} +\item{method}{A character string. Methods include combat_seq (default), +combat and limma_remove_batch_effect.} -\item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{action}{A character string. 
Whether to join the new information to the +input tbl (add), or just get the non-redundant tbl with the new information (get).} \item{...}{Further parameters passed to the function sva::ComBat} -\item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{log_transform}{DEPRECATED - A boolean, whether the value should be +log-transformed (e.g., TRUE for RNA sequencing data)} -\item{transform}{DEPRECATED - A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} +\item{transform}{DEPRECATED - A function that will tranform the counts, +by default it is log1p for RNA sequencing data, but for avoinding +tranformation you can use identity} -\item{inverse_transform}{DEPRECATED - A function that is the inverse of transform (e.g. expm1 is inverse of log1p). This is needed to tranform back the counts after analysis.} +\item{inverse_transform}{DEPRECATED - A function that is the inverse of +transform (e.g. expm1 is inverse of log1p). 
This is needed to tranform +back the counts after analysis.} } \value{ -A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +A consistent object (to the input) with additional columns for +the adjusted counts as `_adjusted` -A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +A consistent object (to the input) with additional columns for the +adjusted counts as `_adjusted` -A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +A consistent object (to the input) with additional columns for the +adjusted counts as `_adjusted` -A consistent object (to the input) with additional columns for the adjusted counts as `_adjusted` +A consistent object (to the input) with additional columns for the +adjusted counts as `_adjusted` A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -adjust_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with an additional adjusted abundance column. This method uses scaled counts if present. +adjust_abundance() takes as input A `tbl` (with at least three +columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with an additional adjusted abundance column. +This method uses scaled counts if present. } \details{ `r lifecycle::badge("maturing")` This function adjusts the abundance for (known) unwanted variation. 
-At the moment just an unwanted covariate is allowed at a time using Combat (DOI: 10.1093/bioinformatics/bts034) +At the moment just an unwanted covariate is allowed at a time using +Combat (DOI: 10.1093/bioinformatics/bts034) Underlying method: sva::ComBat(data, batch = my_batch, mod = design, prior.plots = FALSE, ...) } \examples{ - - cm = tidybulk::se_mini cm$batch = 0 cm$batch[colnames(cm) \%in\% c("SRR1740035", "SRR1740043")] = 1 cm |> identify_abundant() |> -adjust_abundance( .factor_unwanted = batch, .factor_of_interest = condition, method="combat" ) +adjust_abundance( .factor_unwanted = batch, +.factor_of_interest = condition, method="combat" ) } diff --git a/man/aggregate_duplicates-methods.Rd b/man/aggregate_duplicates-methods.Rd index 8a38c69b..ecfe699b 100644 --- a/man/aggregate_duplicates-methods.Rd +++ b/man/aggregate_duplicates-methods.Rd @@ -8,7 +8,8 @@ \alias{aggregate_duplicates,tidybulk-method} \alias{aggregate_duplicates,SummarizedExperiment-method} \alias{aggregate_duplicates,RangedSummarizedExperiment-method} -\title{Aggregates multiple counts from the same samples (e.g., from isoforms), concatenates other character columns, and averages other numeric columns} +\title{Aggregates multiple counts from the same samples (e.g., from isoforms), +concatenates other character columns, and averages other numeric columns} \usage{ aggregate_duplicates( .data, @@ -65,7 +66,9 @@ aggregate_duplicates( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -73,25 +76,34 @@ aggregate_duplicates( \item{.abundance}{The name of the transcript/gene 
abundance column} -\item{aggregation_function}{A function for counts aggregation (e.g., sum, median, or mean)} +\item{aggregation_function}{A function for counts aggregation (e.g., sum, +median, or mean)} \item{keep_integer}{A boolean. Whether to force the aggregated counts to integer} } \value{ -A consistent object (to the input) with aggregated transcript abundance and annotation +A consistent object (to the input) with aggregated transcript +abundance and annotation -A consistent object (to the input) with aggregated transcript abundance and annotation +A consistent object (to the input) with aggregated transcript +abundance and annotation -A consistent object (to the input) with aggregated transcript abundance and annotation +A consistent object (to the input) with aggregated transcript +abundance and annotation -A consistent object (to the input) with aggregated transcript abundance and annotation +A consistent object (to the input) with aggregated transcript +abundance and annotation A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -aggregate_duplicates() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with aggregated transcripts that were duplicated. +aggregate_duplicates() takes as input A `tbl` (with at least +three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with aggregated transcripts that were duplicated. 
} \details{ `r lifecycle::badge("maturing")` diff --git a/man/arrange-methods.Rd b/man/arrange-methods.Rd deleted file mode 100644 index 477fb465..00000000 --- a/man/arrange-methods.Rd +++ /dev/null @@ -1,66 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{arrange} -\alias{arrange} -\title{Arrange rows by column values} -\arguments{ -\item{.data}{A data frame, data frame extension (e.g. a tibble), or a -lazy data frame (e.g. from dbplyr or dtplyr). See *Methods*, below, for -more details.} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Variables, or functions or -variables. Use [desc()] to sort a variable in descending order.} - -\item{.by_group}{If TRUE, will sort first by grouping variable. Applies to grouped data frames only.} -} -\value{ -An object of the same type as `.data`. - -* All rows appear in the output, but (usually) in a different place. -* Columns are not modified. -* Groups are not modified. -* Data frame attributes are preserved. - -A tibble -} -\description{ -`arrange()` order the rows of a data frame rows by the values of selected -columns. - -Unlike other dplyr verbs, `arrange()` largely ignores grouping; you -need to explicit mention grouping variables (or use `by_group = TRUE`) -in order to group by them, and functions of variables are evaluated -once per data frame, not once per group. -} -\details{ -## Locales -The sort order for character vectors will depend on the collating sequence -of the locale in use: see [locales()]. - -## Missing values -Unlike base sorting with `sort()`, `NA` are: -* always sorted to the end for local data, even when wrapped with `desc()`. -* treated differently for remote data, depending on the backend. -} -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. 
- -The following methods are currently available in loaded packages: -} - -\examples{ - -arrange(mtcars, cyl, disp) - -} -\seealso{ -Other single table verbs: -\code{\link{filter}()}, -\code{\link{mutate}()}, -\code{\link{rename}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/arrange.Rd b/man/arrange.Rd new file mode 100644 index 00000000..58180ef4 --- /dev/null +++ b/man/arrange.Rd @@ -0,0 +1,91 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{arrange} +\alias{arrange} +\alias{arrange.tidybulk} +\title{Order rows using column values} +\usage{ +\method{arrange}{tidybulk}(.data, ..., .by_group = FALSE) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Variables, or +functions of variables. Use \code{\link[dplyr:desc]{desc()}} to sort a variable in descending +order.} + +\item{.by_group}{If \code{TRUE}, will sort first by grouping variable. Applies to +grouped data frames only.} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item All rows appear in the output, but (usually) in a different place. +\item Columns are not modified. +\item Groups are not modified. +\item Data frame attributes are preserved. +} +} +\description{ +\code{arrange()} orders the rows of a data frame by the values of selected +columns. + +Unlike other dplyr verbs, \code{arrange()} largely ignores grouping; you +need to explicitly mention grouping variables (or use \code{.by_group = TRUE}) +in order to group by them, and functions of variables are evaluated +once per data frame, not once per group. 
+} +\details{ +\subsection{Missing values}{ + +Unlike base sorting with \code{sort()}, \code{NA} are: +\itemize{ +\item always sorted to the end for local data, even when wrapped with \code{desc()}. +\item treated differently for remote data, depending on the backend. +} +} +} +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("arrange")}. + +} + +\examples{ +arrange(mtcars, cyl, disp) +arrange(mtcars, desc(disp)) + +# grouped arrange ignores groups +by_cyl <- mtcars \%>\% group_by(cyl) +by_cyl \%>\% arrange(desc(wt)) +# Unless you specifically ask: +by_cyl \%>\% arrange(desc(wt), .by_group = TRUE) + +# use embracing when wrapping in a function; +# see ?rlang::args_data_masking for more details +tidy_eval_arrange <- function(.data, var) { + .data \%>\% + arrange({{ var }}) +} +tidy_eval_arrange(mtcars, mpg) + +# Use `across()` or `pick()` to select columns with tidy-select +iris \%>\% arrange(pick(starts_with("Sepal"))) +iris \%>\% arrange(across(starts_with("Sepal"), desc)) +} +\seealso{ +Other single table verbs: +\code{\link{mutate}()}, +\code{\link{rename}()}, +\code{\link{summarise}()} +} +\concept{single table verbs} diff --git a/man/as_SummarizedExperiment-methods.Rd b/man/as_SummarizedExperiment-methods.Rd index 374e2f35..bb737668 100644 --- a/man/as_SummarizedExperiment-methods.Rd +++ b/man/as_SummarizedExperiment-methods.Rd @@ -55,5 +55,7 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -as_SummarizedExperiment() creates a `SummarizedExperiment` object from a `tbl` or `tidybulk` tbl formatted as | | | | <...> | +as_SummarizedExperiment() creates a `SummarizedExperiment` + object from a `tbl` or `tidybulk` tbl 
formatted as | | + | | <...> | } diff --git a/man/as_matrix.Rd b/man/as_matrix.Rd index f03e4296..4a217ba8 100644 --- a/man/as_matrix.Rd +++ b/man/as_matrix.Rd @@ -9,7 +9,8 @@ as_matrix(tbl, rownames = NULL, do_check = TRUE) \arguments{ \item{tbl}{A tibble} -\item{rownames}{The column name of the input tibble that will become the rownames of the output matrix} +\item{rownames}{The column name of the input tibble that will become +the rownames of the output matrix} \item{do_check}{A boolean} } diff --git a/man/bind_rows.Rd b/man/bind_rows.Rd index b98c804e..0ac66681 100644 --- a/man/bind_rows.Rd +++ b/man/bind_rows.Rd @@ -2,7 +2,15 @@ % Please edit documentation in R/dplyr_methods.R \name{bind_rows} \alias{bind_rows} +\alias{bind_rows.tidybulk} +\alias{bind_cols.tidybulk} +\alias{bind_cols} \title{Efficiently bind multiple data frames by row and column} +\usage{ +\method{bind_rows}{tidybulk}(..., .id = NULL) + +\method{bind_cols}{tidybulk}(..., .id = NULL) +} \arguments{ \item{...}{Data frames to combine. @@ -24,29 +32,37 @@ list of data frames is supplied, the labels are taken from the names of the list. If no names are found a numeric sequence is used instead.} - -\item{add.cell.ids}{from Seurat 3.0 A character vector of length(x = c(x, y)). Appends the corresponding values to the start of each objects' cell names.} } \value{ +`bind_rows()` and `bind_cols()` return the same type as + the first input, either a data frame, `tbl_df`, or `grouped_df`. + `bind_rows()` and `bind_cols()` return the same type as the first input, either a data frame, `tbl_df`, or `grouped_df`. } \description{ +This is an efficient implementation of the common pattern of +`do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many +data frames into one. + This is an efficient implementation of the common pattern of `do.call(rbind, dfs)` or `do.call(cbind, dfs)` for binding many data frames into one. 
} \details{ +The output of `bind_rows()` will contain a column if that column +appears in any of the inputs. + The output of `bind_rows()` will contain a column if that column appears in any of the inputs. } \examples{ data(se_mini) -se_mini_tidybulk = se_mini |> tidybulk() -bind_rows( se_mini_tidybulk, se_mini_tidybulk ) +se_mini_tidybulk <- se_mini |> tidybulk() +bind_rows(se_mini_tidybulk, se_mini_tidybulk) -tt_bind = se_mini_tidybulk |> select(time, condition) +tt_bind <- se_mini_tidybulk |> select(time, condition) se_mini_tidybulk |> bind_cols(tt_bind) } diff --git a/man/cluster_elements-methods.Rd b/man/cluster_elements-methods.Rd index e1302075..4264f521 100644 --- a/man/cluster_elements-methods.Rd +++ b/man/cluster_elements-methods.Rd @@ -89,25 +89,35 @@ cluster_elements( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, +feature and transcript abundance) or `SummarizedExperiment` +(more convenient if abstracted to tibble with +library(tidySummarizedExperiment))} \item{.element}{The name of the element column (normally samples).} \item{.feature}{The name of the feature column (normally transcripts/genes)} -\item{.abundance}{The name of the column including the numerical value the clustering is based on (normally transcript abundance)} +\item{.abundance}{The name of the column including the numerical value the +clustering is based on (normally transcript abundance)} -\item{method}{A character string. The cluster algorithm to use, at the moment k-means is the only algorithm included.} +\item{method}{A character string. The cluster algorithm to use, at the +moment k-means is the only algorithm included.} -\item{of_samples}{A boolean. 
In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column} +\item{of_samples}{A boolean. In case the input is a tidybulk object, +it indicates Whether the element column will be sample or transcript column} -\item{transform}{A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} +\item{transform}{A function that will tranform the counts, by default it is +log1p for RNA sequencing data, but for avoinding tranformation you can +use identity} -\item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{action}{A character string. Whether to join the new information to +the input tbl (add), or just get the non-redundant tbl with the new information (get).} \item{...}{Further parameters passed to the function kmeans} -\item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{log_transform}{DEPRECATED - A boolean, whether the value should be +log-transformed (e.g., TRUE for RNA sequencing data)} } \value{ A tbl object with additional columns with cluster labels @@ -123,14 +133,19 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -cluster_elements() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and identify clusters in the data. +cluster_elements() takes as input A `tbl` (with at least + three columns for sample, feature and transcript abundance) or + `SummarizedExperiment` (more convenient if abstracted to tibble with + library(tidySummarizedExperiment)) and identify clusters in the data. 
} \details{ `r lifecycle::badge("maturing")` identifies clusters in the data, normally of samples. This function returns a tibble with additional columns for the cluster annotation. -At the moment only k-means (DOI: 10.2307/2346830) and SNN clustering (DOI:10.1016/j.cell.2019.05.031) is supported, the plan is to introduce more clustering methods. +At the moment only k-means (DOI: 10.2307/2346830) and SNN clustering +(DOI:10.1016/j.cell.2019.05.031) is supported, the plan is to introduce more +clustering methods. Underlying method for kmeans do.call(kmeans(.data, iter.max = 1000, ...) diff --git a/man/deconvolve_cellularity-methods.Rd b/man/deconvolve_cellularity-methods.Rd index 9125e4d9..48b4c20d 100644 --- a/man/deconvolve_cellularity-methods.Rd +++ b/man/deconvolve_cellularity-methods.Rd @@ -83,7 +83,9 @@ deconvolve_cellularity( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -91,31 +93,48 @@ deconvolve_cellularity( \item{.abundance}{The name of the transcript/gene abundance column} -\item{reference}{A data frame. The methods cibersort and llsr can accept a custom rectangular dataframe with genes as rows names, cell types as column names and gene-transcript abundance as values. For exampler tidybulk::X_cibersort. The transcript/cell_type data frame of integer transcript abundance. If NULL, the default reference for each algorithm will be used. For llsr will be LM22.} +\item{reference}{A data frame. 
The methods cibersort and llsr can accept a +custom rectangular dataframe with genes as rows names, cell types as column +names and gene-transcript abundance as values. For exampler tidybulk::X_cibersort. +The transcript/cell_type data frame of integer transcript abundance. If NULL, +the default reference for each algorithm will be used. For llsr will be LM22.} -\item{method}{A character string. The method to be used. At the moment Cibersort (default, can accept custom reference), epic (can accept custom reference) and llsr (linear least squares regression, can accept custom reference), mcp_counter, quantiseq, xcell are available.} +\item{method}{A character string. The method to be used. At the moment +Cibersort (default, can accept custom reference), epic (can accept custom +reference) and llsr (linear least squares regression, can accept custom +reference), mcp_counter, quantiseq, xcell are available.} -\item{prefix}{A character string. The prefix you would like to add to the result columns. It is useful if you want to reshape data.} +\item{prefix}{A character string. The prefix you would like to add to the +result columns. It is useful if you want to reshape data.} -\item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{action}{A character string. 
Whether to join the new information to the +input tbl (add), or just get the non-redundant tbl with the new information (get).} \item{...}{Further parameters passed to the function Cibersort} } \value{ -A consistent object (to the input) including additional columns for each cell type estimated +A consistent object (to the input) including additional columns +for each cell type estimated -A consistent object (to the input) including additional columns for each cell type estimated +A consistent object (to the input) including additional columns +for each cell type estimated -A consistent object (to the input) including additional columns for each cell type estimated +A consistent object (to the input) including additional columns +for each cell type estimated -A consistent object (to the input) including additional columns for each cell type estimated +A consistent object (to the input) including additional columns +for each cell type estimated A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -deconvolve_cellularity() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with the estimated cell type abundance for each sample +deconvolve_cellularity() takes as input A `tbl` (with at least +three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object (to the +input) with the estimated cell type abundance for each sample } \details{ `r lifecycle::badge("maturing")` @@ -128,7 +147,6 @@ CIBERSORT(Y = data, X = reference, ...) 
} \examples{ - # Subsetting for time efficiency tidybulk::se_mini |> deconvolve_cellularity(cores = 1) diff --git a/man/describe_transcript-methods.Rd b/man/describe_transcript-methods.Rd index 8bf4e5ee..4e243201 100644 --- a/man/describe_transcript-methods.Rd +++ b/man/describe_transcript-methods.Rd @@ -41,9 +41,11 @@ A consistent object (to the input) including additional columns for transcript s A `SummarizedExperiment` object -A consistent object (to the input) including additional columns for transcript symbol +A consistent object (to the input) including additional +columns for transcript symbol -A consistent object (to the input) including additional columns for transcript symbol +A consistent object (to the input) including additional columns +for transcript symbol } \description{ Get DESCRIPTION from gene SYMBOL for Human and Mouse diff --git a/man/distinct-methods.Rd b/man/distinct-methods.Rd deleted file mode 100644 index 6bf62635..00000000 --- a/man/distinct-methods.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{distinct} -\alias{distinct} -\title{distinct} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{Data frames to combine (See dplyr)} - -\item{.keep_all}{If TRUE, keep all variables in .data. If a combination of ... is not distinct, this keeps the first row of values. 
(See dplyr)} -} -\value{ -A tt object -} -\description{ -distinct -} -\examples{ - -tidybulk::se_mini |> tidybulk() |> distinct() - - -} diff --git a/man/distinct.Rd b/man/distinct.Rd new file mode 100644 index 00000000..b362a370 --- /dev/null +++ b/man/distinct.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{distinct} +\alias{distinct} +\alias{distinct.tidybulk} +\title{Keep distinct/unique rows} +\usage{ +\method{distinct}{tidybulk}(.data, ..., .keep_all = FALSE) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Optional variables to +use when determining uniqueness. If there are multiple rows for a given +combination of inputs, only the first row will be preserved. If omitted, +will use all variables in the data frame.} + +\item{.keep_all}{If \code{TRUE}, keep all variables in \code{.data}. +If a combination of \code{...} is not distinct, this keeps the +first row of values.} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Rows are a subset of the input but appear in the same order. +\item Columns are not modified if \code{...} is empty or \code{.keep_all} is \code{TRUE}. +Otherwise, \code{distinct()} first calls \code{mutate()} to create new columns. +\item Groups are not modified. +\item Data frame attributes are preserved. +} +} +\description{ +Keep only unique/distinct rows from a data frame. This is similar +to \code{\link[=unique.data.frame]{unique.data.frame()}} but considerably faster. +} +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. 
See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("distinct")}. + +} + +\examples{ +data(se_mini) +se_mini |> tidybulk() |> distinct() + +} diff --git a/man/dplyr-methods.Rd b/man/dplyr-methods.Rd deleted file mode 100644 index fbdf8de4..00000000 --- a/man/dplyr-methods.Rd +++ /dev/null @@ -1,31 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{bind_cols} -\alias{bind_cols} -\alias{left_join} -\title{Left join datasets} -\arguments{ -\item{x}{tbls to join. (See dplyr)} - -\item{y}{tbls to join. (See dplyr)} - -\item{by}{A character vector of variables to join by. (See dplyr)} - -\item{copy}{If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr)} - -\item{suffix}{If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. 
(See dplyr)} - -\item{...}{Data frames to combine (See dplyr)} -} -\value{ -A tt object -} -\description{ -Left join datasets -} -\examples{ - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) - -} diff --git a/man/ensembl_to_symbol-methods.Rd b/man/ensembl_to_symbol-methods.Rd index 2ea0a01a..c528883c 100644 --- a/man/ensembl_to_symbol-methods.Rd +++ b/man/ensembl_to_symbol-methods.Rd @@ -17,33 +17,46 @@ ensembl_to_symbol(.data, .ensembl, action = "add") \S4method{ensembl_to_symbol}{tidybulk}(.data, .ensembl, action = "add") } \arguments{ -\item{.data}{a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{a `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.ensembl}{A character string. The column that is represents ensembl gene id} +\item{.ensembl}{A character string. The column that is represents +ensembl gene id} -\item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{action}{A character string. 
Whether to join the new information +to the input tbl (add), or just get the non-redundant tbl with the new +information (get).} } \value{ -A consistent object (to the input) including additional columns for transcript symbol +A consistent object (to the input) including additional columns +for transcript symbol -A consistent object (to the input) including additional columns for transcript symbol +A consistent object (to the input) including additional columns +for transcript symbol -A consistent object (to the input) including additional columns for transcript symbol +A consistent object (to the input) including additional columns +for transcript symbol -A consistent object (to the input) including additional columns for transcript symbol +A consistent object (to the input) including additional columns +for transcript symbol } \description{ -ensembl_to_symbol() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with the additional transcript symbol column +ensembl_to_symbol() takes as input a `tbl` (with at least +three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with the additional transcript symbol column } \details{ \lifecycle{questioning} -This is useful since different resources use ensembl IDs while others use gene symbol IDs. At the moment this work for human (genes and transcripts) and mouse (genes) data. +This is useful since different resources use ensembl IDs while +others use gene symbol IDs. At the moment this work for human (genes and +transcripts) and mouse (genes) data. } \examples{ - - # This function was designed for data.frame # Convert from SummarizedExperiment for this example. 
It is NOT reccomended. diff --git a/man/fill_missing_abundance-methods.Rd b/man/fill_missing_abundance-methods.Rd index b0abf168..062f22ea 100644 --- a/man/fill_missing_abundance-methods.Rd +++ b/man/fill_missing_abundance-methods.Rd @@ -61,12 +61,17 @@ A consistent object (to the input) with filled abundance A consistent object (to the input) with filled abundance } \description{ -fill_missing_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with new observations +fill_missing_abundance() takes as input A `tbl` (with at least +three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with new observations } \details{ \lifecycle{questioning} -This function fills the abundance of missing sample-transcript pair using the median of the sample group defined by the formula +This function fills the abundance of missing sample-transcript +pair using the median of the sample group defined by the formula } \examples{ diff --git a/man/filter-methods.Rd b/man/filter-methods.Rd deleted file mode 100644 index c9a6c68b..00000000 --- a/man/filter-methods.Rd +++ /dev/null @@ -1,85 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{filter} -\alias{filter} -\title{Subset rows using column values} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Logical predicates defined in -terms of the variables in `.data`. -Multiple conditions are combined with `&`. 
Only rows where the -condition evaluates to `TRUE` are kept.} - -\item{.preserve}{when `FALSE` (the default), the grouping structure -is recalculated based on the resulting data, otherwise it is kept as is.} -} -\value{ -An object of the same type as `.data`. - -* Rows are a subset of the input, but appear in the same order. -* Columns are not modified. -* The number of groups may be reduced (if `.preserve` is not `TRUE`). -* Data frame attributes are preserved. -} -\description{ -`filter()` retains the rows where the conditions you provide a `TRUE`. Note -that, unlike base subsetting with `[`, rows where the condition evaluates -to `NA` are dropped. -} -\details{ -dplyr is not yet smart enough to optimise filtering optimisation -on grouped datasets that don't need grouped calculations. For this reason, -filtering is often considerably faster on [ungroup()]ed data. -} -\section{Useful filter functions}{ - - -* [`==`], [`>`], [`>=`] etc -* [`&`], [`|`], [`!`], [xor()] -* [is.na()] -* [between()], [near()] -} - -\section{Grouped tibbles}{ - - -Because filtering expressions are computed within groups, they may -yield different results on grouped tibbles. This will be the case -as soon as an aggregating, lagging, or ranking function is -involved. Compare this ungrouped filtering: - - -The former keeps rows with `mass` greater than the global average -whereas the latter keeps rows with `mass` greater than the gender - -average. -} - -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -The following methods are currently available in loaded packages: -} - -\examples{ - -data(se) - -se |> tidybulk() |> filter(dex=="untrt") - -# Learn more in ?dplyr_tidy_eval -} -\seealso{ -[filter_all()], [filter_if()] and [filter_at()]. 
- -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{mutate}()}, -\code{\link{rename}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/filter.Rd b/man/filter.Rd new file mode 100644 index 00000000..7f568a20 --- /dev/null +++ b/man/filter.Rd @@ -0,0 +1,116 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{filter} +\alias{filter} +\alias{filter.tidybulk} +\title{Keep rows that match a condition} +\usage{ +\method{filter}{tidybulk}(.data, ..., .preserve = FALSE) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Expressions that +return a logical value, and are defined in terms of the variables in +\code{.data}. If multiple expressions are included, they are combined with the +\code{&} operator. Only rows for which all conditions evaluate to \code{TRUE} are +kept.} + +\item{.preserve}{Relevant when the \code{.data} input is grouped. +If \code{.preserve = FALSE} (the default), the grouping structure +is recalculated based on the resulting data, otherwise the grouping is kept as is.} +} +\value{ +An object of the same type as \code{.data}. The output has the following properties: +\itemize{ +\item Rows are a subset of the input, but appear in the same order. +\item Columns are not modified. +\item The number of groups may be reduced (if \code{.preserve} is not \code{TRUE}). +\item Data frame attributes are preserved. +} +} +\description{ +The \code{filter()} function is used to subset a data frame, +retaining all rows that satisfy your conditions. +To be retained, the row must produce a value of \code{TRUE} for all conditions. +Note that when a condition evaluates to \code{NA} +the row will be dropped, unlike base subsetting with \code{[}. 
+} +\details{ +The \code{filter()} function is used to subset the rows of +\code{.data}, applying the expressions in \code{...} to the column values to determine which +rows should be retained. It can be applied to both grouped and ungrouped data (see \code{\link[dplyr:group_by]{group_by()}} and +\code{\link[dplyr:ungroup]{ungroup()}}). However, dplyr is not yet smart enough to optimise the filtering +operation on grouped datasets that do not need grouped calculations. For this +reason, filtering is often considerably faster on ungrouped data. +} +\section{Useful filter functions}{ + + + +There are many functions and operators that are useful when constructing the +expressions used to filter the data: +\itemize{ +\item \code{\link{==}}, \code{\link{>}}, \code{\link{>=}} etc +\item \code{\link{&}}, \code{\link{|}}, \code{\link{!}}, \code{\link[=xor]{xor()}} +\item \code{\link[=is.na]{is.na()}} +\item \code{\link[dplyr:between]{between()}}, \code{\link[dplyr:near]{near()}} +} + +} + +\section{Grouped tibbles}{ + + + +Because filtering expressions are computed within groups, they may +yield different results on grouped tibbles. This will be the case +as soon as an aggregating, lagging, or ranking function is +involved. Compare this ungrouped filtering: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars \%>\% filter(mass > mean(mass, na.rm = TRUE)) +}\if{html}{\out{</div>
}} + +With the grouped equivalent: + +\if{html}{\out{<div class="sourceCode">
}}\preformatted{starwars \%>\% group_by(gender) \%>\% filter(mass > mean(mass, na.rm = TRUE)) +}\if{html}{\out{</div>
}} + +In the ungrouped version, \code{filter()} compares the value of \code{mass} in each row to +the global average (taken over the whole data set), keeping only the rows with +\code{mass} greater than this global average. In contrast, the grouped version calculates +the average mass separately for each \code{gender} group, and keeps rows with \code{mass} greater +than the relevant within-gender average. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. + +} + +\examples{ +data(se) +se |> tidybulk() |> filter(dex=="untrt") +# Learn more in ?dplyr_tidy_eval + +} +\seealso{ +Other single table verbs: +\code{\link[dplyr]{arrange}()}, +\code{\link[dplyr]{mutate}()}, +\code{\link[dplyr]{reframe}()}, +\code{\link[dplyr]{rename}()}, +\code{\link[dplyr]{select}()}, +\code{\link[dplyr]{slice}()}, +\code{\link[dplyr]{summarise}()} +} diff --git a/man/full_join.Rd b/man/full_join.Rd new file mode 100644 index 00000000..1a60ee82 --- /dev/null +++ b/man/full_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{full_join} +\alias{full_join} +\alias{full_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{full_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. 
+ +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. 
+Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affect by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched of \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. 
+\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. +} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. 
+ +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. +\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. +} + +} + +\examples{ +data(se_mini) +annotation <- se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/get_bibliography-methods.Rd b/man/get_bibliography-methods.Rd index 759e8a0e..93b7f728 100644 --- a/man/get_bibliography-methods.Rd +++ b/man/get_bibliography-methods.Rd @@ -26,14 +26,21 @@ get_bibliography(.data) \S4method{get_bibliography}{RangedSummarizedExperiment}(.data) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} } \value{ -NULL. It prints a list of bibliography references for the software used through the workflow. +NULL. It prints a list of bibliography references for the software +used through the workflow. 
-A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). } \description{ get_bibliography() takes as input a `tidybulk` @@ -41,13 +48,11 @@ get_bibliography() takes as input a `tidybulk` \details{ `r lifecycle::badge("maturing")` -This methods returns the bibliography list of your workflow from the internals of a tidybulk object (attr(., "internals")) +This methods returns the bibliography list of your workflow from +the internals of a tidybulk object (attr(., "internals")) } \examples{ - - get_bibliography(tidybulk::se_mini) - } diff --git a/man/get_reduced_dimensions_UMAP_bulk.Rd b/man/get_reduced_dimensions_UMAP_bulk.Rd index 9404148d..696fdbec 100644 --- a/man/get_reduced_dimensions_UMAP_bulk.Rd +++ b/man/get_reduced_dimensions_UMAP_bulk.Rd @@ -21,23 +21,30 @@ get_reduced_dimensions_UMAP_bulk( \arguments{ \item{.data}{A tibble} -\item{.element}{A column symbol. The column that is used to calculate distance (i.e., normally samples)} +\item{.element}{A column symbol. The column that is used to calculate +distance (i.e., normally samples)} -\item{.feature}{A column symbol. The column that is represents entities to cluster (i.e., normally genes)} +\item{.feature}{A column symbol. 
The column that is represents entities +to cluster (i.e., normally genes)} -\item{.abundance}{A column symbol with the value the clustering is based on (e.g., `count`)} +\item{.abundance}{A column symbol with the value the clustering +is based on (e.g., `count`)} -\item{.dims}{A integer vector corresponding to principal components of interest (e.g., 1:6)} +\item{.dims}{A integer vector corresponding to principal components +of interest (e.g., 1:6)} \item{top}{An integer. How many top genes to select} \item{of_samples}{A boolean} -\item{calculate_for_pca_dimensions}{An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered} +\item{calculate_for_pca_dimensions}{An integer of length one. The number of +PCA dimensions to based the UMAP calculatio on. +If NULL all variable features are considered} \item{...}{Further parameters passed to the function uwot} -\item{log_transform}{A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{log_transform}{A boolean, whether the value should be log-transformed +(e.g., TRUE for RNA sequencing data)} } \value{ A tibble with additional columns diff --git a/man/get_reduced_dimensions_UMAP_bulk_SE.Rd b/man/get_reduced_dimensions_UMAP_bulk_SE.Rd index 163b194c..5d9c5add 100644 --- a/man/get_reduced_dimensions_UMAP_bulk_SE.Rd +++ b/man/get_reduced_dimensions_UMAP_bulk_SE.Rd @@ -18,23 +18,30 @@ get_reduced_dimensions_UMAP_bulk_SE( \arguments{ \item{.data}{A tibble} -\item{.dims}{A integer vector corresponding to principal components of interest (e.g., 1:6)} +\item{.dims}{A integer vector corresponding to principal components of +interest (e.g., 1:6)} \item{top}{An integer. 
How many top genes to select} \item{of_samples}{A boolean} -\item{transform}{A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} +\item{transform}{A function that will tranform the counts, by default it is +log1p for RNA sequencing data, but for avoinding tranformation you can use identity} -\item{calculate_for_pca_dimensions}{An integer of length one. The number of PCA dimensions to based the UMAP calculatio on. If NULL all variable features are considered} +\item{calculate_for_pca_dimensions}{An integer of length one. The number of +PCA dimensions to based the UMAP calculatio on. If NULL all variable +features are considered} \item{...}{Further parameters passed to the function uwot} -\item{.abundance}{A column symbol with the value the clustering is based on (e.g., `count`)} +\item{.abundance}{A column symbol with the value the clustering is +based on (e.g., `count`)} -\item{.feature}{A column symbol. The column that is represents entities to cluster (i.e., normally genes)} +\item{.feature}{A column symbol. The column that is represents entities to +cluster (i.e., normally genes)} -\item{.element}{A column symbol. The column that is used to calculate distance (i.e., normally samples)} +\item{.element}{A column symbol. The column that is used to calculate +distance (i.e., normally samples)} } \value{ A tibble with additional columns diff --git a/man/group_by-methods.Rd b/man/group_by-methods.Rd deleted file mode 100644 index 3fcd8adc..00000000 --- a/man/group_by-methods.Rd +++ /dev/null @@ -1,47 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{group_by} -\alias{group_by} -\title{Group by one or more variables} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{In `group_by()`, variables or computations to group by. 
-In `ungroup()`, variables to remove from the grouping.} - -\item{.add}{When `FALSE`, the default, `group_by()` will - override existing groups. To add to the existing groups, use - `.add = TRUE`. - - This argument was previously called `add`, but that prevented - creating a new grouping variable called `add`, and conflicts with - our naming conventions.} - -\item{.drop}{When `.drop = TRUE`, empty groups are dropped. See [group_by_drop_default()] for -what the default value is for this argument.} -} -\value{ -A [grouped data frame][grouped_df()], unless the combination of `...` and `add` - yields a non empty set of grouping columns, a regular (ungrouped) data frame - otherwise. -} -\description{ -Most data operations are done on groups defined by variables. -`group_by()` takes an existing tbl and converts it into a grouped tbl -where operations are performed "by group". `ungroup()` removes grouping. -} -\section{Methods}{ - -These function are **generic**s, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. - -Methods available in currently loaded packages: -} - -\examples{ - -by_cyl <- mtcars |> group_by(cyl) - -} -\concept{grouping functions} diff --git a/man/group_by.Rd b/man/group_by.Rd new file mode 100644 index 00000000..5ff1f037 --- /dev/null +++ b/man/group_by.Rd @@ -0,0 +1,162 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{group_by} +\alias{group_by} +\alias{group_by.tidybulk} +\title{Group by one or more variables} +\usage{ +\method{group_by}{tidybulk}(.data, ..., .add = FALSE, .drop = group_by_drop_default(.data)) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{In \code{group_by()}, variables or computations to group by. 
+Computations are always done on the ungrouped data frame. +To perform computations on the grouped data, you need to use +a separate \code{mutate()} step before the \code{group_by()}. +Computations are not allowed in \code{nest_by()}. +In \code{ungroup()}, variables to remove from the grouping.} + +\item{.add}{When \code{FALSE}, the default, \code{group_by()} will +override existing groups. To add to the existing groups, use +\code{.add = TRUE}. + +This argument was previously called \code{add}, but that prevented +creating a new grouping variable called \code{add}, and conflicts with +our naming conventions.} + +\item{.drop}{Drop groups formed by factor levels that don't appear in the +data? The default is \code{TRUE} except when \code{.data} has been previously +grouped with \code{.drop = FALSE}. See \code{\link[dplyr:group_by_drop_default]{group_by_drop_default()}} for details.} +} +\value{ +A grouped data frame with class \code{\link[dplyr]{grouped_df}}, +unless the combination of \code{...} and \code{add} yields a empty set of +grouping columns, in which case a tibble will be returned. +} +\description{ +Most data operations are done on groups defined by variables. +\code{group_by()} takes an existing tbl and converts it into a grouped tbl +where operations are performed "by group". \code{ungroup()} removes grouping. +} +\section{Methods}{ + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{group_by()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. +\item \code{ungroup()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. +} + +} + +\section{Ordering}{ + + +Currently, \code{group_by()} internally orders the groups in ascending order. 
This +results in ordered output from functions that aggregate groups, such as +\code{\link[dplyr:summarise]{summarise()}}. + +When used as grouping columns, character vectors are ordered in the C locale +for performance and reproducibility across R sessions. If the resulting +ordering of your grouped operation matters and is dependent on the locale, +you should follow up the grouped operation with an explicit call to +\code{\link[dplyr:arrange]{arrange()}} and set the \code{.locale} argument. For example: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{data \%>\% + group_by(chr) \%>\% + summarise(avg = mean(x)) \%>\% + arrange(chr, .locale = "en") +}\if{html}{\out{</div>
}} + +This is often useful as a preliminary step before generating content intended +for humans, such as an HTML table. +\subsection{Legacy behavior}{ + +Prior to dplyr 1.1.0, character vector grouping columns were ordered in the +system locale. If you need to temporarily revert to this behavior, you can +set the global option \code{dplyr.legacy_locale} to \code{TRUE}, but this should be +used sparingly and you should expect this option to be removed in a future +version of dplyr. It is better to update existing code to explicitly call +\code{arrange(.locale = )} instead. Note that setting \code{dplyr.legacy_locale} will +also force calls to \code{\link[dplyr:arrange]{arrange()}} to use the system locale. +} + +} + +\examples{ +by_cyl <- mtcars \%>\% group_by(cyl) + +# grouping doesn't change how the data looks (apart from listing +# how it's grouped): +by_cyl + +# It changes how it acts with the other dplyr verbs: +by_cyl \%>\% summarise( + disp = mean(disp), + hp = mean(hp) +) +by_cyl \%>\% filter(disp == max(disp)) + +# Each call to summarise() removes a layer of grouping +by_vs_am <- mtcars \%>\% group_by(vs, am) +by_vs <- by_vs_am \%>\% summarise(n = n()) +by_vs +by_vs \%>\% summarise(n = sum(n)) + +# To removing grouping, use ungroup +by_vs \%>\% + ungroup() \%>\% + summarise(n = sum(n)) + +# By default, group_by() overrides existing grouping +by_cyl \%>\% + group_by(vs, am) \%>\% + group_vars() + +# Use add = TRUE to instead append +by_cyl \%>\% + group_by(vs, am, .add = TRUE) \%>\% + group_vars() + +# You can group by expressions: this is a short-hand +# for a mutate() followed by a group_by() +mtcars \%>\% + group_by(vsam = vs + am) + +# The implicit mutate() step is always performed on the +# ungrouped data. Here we get 3 groups: +mtcars \%>\% + group_by(vs) \%>\% + group_by(hp_cut = cut(hp, 3)) + +# If you want it to be performed by groups, +# you have to use an explicit mutate() call. 
+# Here we get 3 groups per value of vs +mtcars \%>\% + group_by(vs) \%>\% + mutate(hp_cut = cut(hp, 3)) \%>\% + group_by(hp_cut) + +# when factors are involved and .drop = FALSE, groups can be empty +tbl <- tibble( + x = 1:10, + y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) +) +tbl \%>\% + group_by(y, .drop = FALSE) \%>\% + group_rows() +} +\seealso{ +Other grouping functions: +\code{\link[dplyr]{group_map}()}, +\code{\link[dplyr]{group_nest}()}, +\code{\link[dplyr]{group_split}()}, +\code{\link[dplyr]{group_trim}()} +} diff --git a/man/identify_abundant-methods.Rd b/man/identify_abundant-methods.Rd index 0bc9ba8d..b27ce70c 100644 --- a/man/identify_abundant-methods.Rd +++ b/man/identify_abundant-methods.Rd @@ -71,7 +71,9 @@ identify_abundant( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -79,27 +81,45 @@ identify_abundant( \item{.abundance}{The name of the transcript/gene abundance column} -\item{factor_of_interest}{The name of the column of the factor of interest. This is used for defining sample groups for the filtering process. It uses the filterByExpr function from edgeR.} +\item{factor_of_interest}{The name of the column of the factor of interest. +This is used for defining sample groups for the filtering process. It uses +the filterByExpr function from edgeR.} -\item{minimum_counts}{A real positive number. It is the threshold of count per million that is used to filter transcripts/genes out from the scaling procedure.} +\item{minimum_counts}{A real positive number. 
It is the threshold of count +per million that is used to filter transcripts/genes out from the scaling procedure.} -\item{minimum_proportion}{A real positive number between 0 and 1. It is the threshold of proportion of samples for each transcripts/genes that have to be characterised by a cmp bigger than the threshold to be included for scaling procedure.} +\item{minimum_proportion}{A real positive number between 0 and 1. It is the +threshold of proportion of samples for each transcript/gene that has to +be characterised by a cpm bigger than the threshold to be included in the +scaling procedure.} } \value{ -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate).
A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -identify_abundant() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +identify_abundant() takes as input A `tbl` (with at least three +columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with additional columns for the statistics from the hypothesis test. } \details{ `r lifecycle::badge("maturing")` @@ -116,8 +136,6 @@ At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) } \examples{ - - identify_abundant( tidybulk::se_mini ) diff --git a/man/impute_missing_abundance-methods.Rd b/man/impute_missing_abundance-methods.Rd index 6781eb7d..f8abf71f 100644 --- a/man/impute_missing_abundance-methods.Rd +++ b/man/impute_missing_abundance-methods.Rd @@ -71,9 +71,13 @@ impute_missing_abundance( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{A formula with no response variable, representing the desired linear model where the first covariate is the factor of interest and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch)} +\item{.formula}{A formula with no response variable, representing the +desired linear model where the 
first covariate is the factor of interest +and the second covariate is the unwanted variation (of the kind ~ factor_of_interest + batch)} \item{.sample}{The name of the sample column} @@ -81,9 +85,13 @@ impute_missing_abundance( \item{.abundance}{The name of the transcript/gene abundance column} -\item{suffix}{A character string. This is added to the imputed count column names. If empty the count column are overwritten} +\item{suffix}{A character string. This is added to the imputed count column +names. If empty, the count columns are overwritten} -\item{force_scaling}{A boolean. In case a abundance-containing column is not scaled (columns with _scale suffix), setting force_scaling = TRUE will result in a scaling by library size, to compensating for a possible difference in sequencing depth.} +\item{force_scaling}{A boolean. In case an abundance-containing column is not +scaled (columns with _scale suffix), setting force_scaling = TRUE will result +in a scaling by library size, to compensate for a possible difference in +sequencing depth.} } \value{ A consistent object (to the input) non-sparse abundance @@ -99,12 +107,18 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -impute_missing_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional sample-transcript pairs with imputed transcript abundance. +impute_missing_abundance() takes as input a `tbl` (with at least +three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with additional sample-transcript pairs with imputed +transcript abundance.
} \details{ `r lifecycle::badge("maturing")` -This function imputes the abundance of missing sample-transcript pair using the median of the sample group defined by the formula +This function imputes the abundance of missing sample-transcript +pair using the median of the sample group defined by the formula } \examples{ diff --git a/man/inner_join.Rd b/man/inner_join.Rd new file mode 100644 index 00000000..1556c9f4 --- /dev/null +++ b/man/inner_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{inner_join} +\alias{inner_join} +\alias{inner_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{inner_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. + +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. 
+ +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. +Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affect by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched of \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. 
+\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. +\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. +} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. 
+ +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. + +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. +\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. 
+} + +} + +\examples{ +data(se_mini) +annotation <- tidybulk::se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/join-methods.Rd b/man/join-methods.Rd deleted file mode 100644 index 0e4d8ac6..00000000 --- a/man/join-methods.Rd +++ /dev/null @@ -1,48 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{inner_join} -\alias{inner_join} -\alias{right_join} -\alias{full_join} -\title{Inner join datasets} -\arguments{ -\item{x}{tbls to join. (See dplyr)} - -\item{y}{tbls to join. (See dplyr)} - -\item{by}{A character vector of variables to join by. (See dplyr)} - -\item{copy}{If x and y are not from the same data source, and copy is TRUE, then y will be copied into the same src as x. (See dplyr)} - -\item{suffix}{If there are non-joined duplicate variables in x and y, these suffixes will be added to the output to disambiguate them. Should be a character vector of length 2. 
(See dplyr)} - -\item{...}{Data frames to combine (See dplyr)} -} -\value{ -A tt object - -A tt object - -A tt object -} -\description{ -Inner join datasets - -Right join datasets - -Full join datasets -} -\examples{ - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> inner_join(annotation) - - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) - - -annotation = tidybulk::se_mini |> tidybulk() |> as_tibble() |> distinct(.sample) |> mutate(source = "AU") -tidybulk::se_mini |> tidybulk() |> as_tibble() |> full_join(annotation) - -} diff --git a/man/keep_abundant-methods.Rd b/man/keep_abundant-methods.Rd index efe98f34..b52b58ca 100644 --- a/man/keep_abundant-methods.Rd +++ b/man/keep_abundant-methods.Rd @@ -71,7 +71,9 @@ keep_abundant( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -79,27 +81,47 @@ keep_abundant( \item{.abundance}{The name of the transcript/gene abundance column} -\item{factor_of_interest}{The name of the column of the factor of interest. This is used for defining sample groups for the filtering process. It uses the filterByExpr function from edgeR.} +\item{factor_of_interest}{The name of the column of the factor of interest. +This is used for defining sample groups for the filtering process. It uses +the filterByExpr function from edgeR.} -\item{minimum_counts}{A real positive number. 
It is the threshold of count per million that is used to filter transcripts/genes out from the scaling procedure.} +\item{minimum_counts}{A real positive number. It is the threshold of count +per million that is used to filter transcripts/genes out from the scaling +procedure.} -\item{minimum_proportion}{A real positive number between 0 and 1. It is the threshold of proportion of samples for each transcripts/genes that have to be characterised by a cmp bigger than the threshold to be included for scaling procedure.} +\item{minimum_proportion}{A real positive number between 0 and 1. It is the +threshold of proportion of samples for each transcripts/genes that have to +be characterised by a cmp bigger than the threshold to be included for +scaling procedure.} } \value{ -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for +the statistics from the hypothesis test (e.g., log fold change, p-value +and false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). 
+A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -keep_abundant() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +keep_abundant() takes as input A `tbl` (with at least three +columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with additional columns for the statistics from the +hypothesis test. } \details{ \lifecycle{questioning} @@ -115,9 +137,6 @@ At the moment this function uses edgeR (DOI: 10.1093/bioinformatics/btp616) ) } \examples{ - - - keep_abundant( tidybulk::se_mini ) diff --git a/man/keep_variable-methods.Rd b/man/keep_variable-methods.Rd index 6c257ebb..251c26f7 100644 --- a/man/keep_variable-methods.Rd +++ b/man/keep_variable-methods.Rd @@ -55,7 +55,9 @@ keep_variable( \S4method{keep_variable}{RangedSummarizedExperiment}(.data, top = 500, transform = log1p) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -65,12 +67,17 @@ keep_variable( \item{top}{Integer. 
Number of top transcript to consider} -\item{transform}{A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} +\item{transform}{A function that will tranform the counts, by default it is +log1p for RNA sequencing data, but for avoinding tranformation you can use +identity} -\item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{log_transform}{DEPRECATED - A boolean, whether the value should be +log-transformed (e.g., TRUE for RNA sequencing data)} } \value{ -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). Underlying method: s <- rowMeans((x - rowMeans(x)) ^ 2) @@ -78,28 +85,38 @@ o <- order(s, decreasing = TRUE) x <- x[o[1L:top], , drop = FALSE] variable_trancripts = rownames(x) -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for +the statistics from the hypothesis test (e.g., log fold change, p-value +and false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). 
+A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -keep_variable() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +keep_variable() takes as input A `tbl` (with at least three +columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with additional columns for the statistics from the +hypothesis test. } \details{ `r lifecycle::badge("maturing")` -At the moment this function uses edgeR \url{https://doi.org/10.1093/bioinformatics/btp616} +At the moment this function uses edgeR +\url{https://doi.org/10.1093/bioinformatics/btp616} } \examples{ - - keep_variable(tidybulk::se_mini, top = 500) diff --git a/man/left_join.Rd b/man/left_join.Rd new file mode 100644 index 00000000..58657133 --- /dev/null +++ b/man/left_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{left_join} +\alias{left_join} +\alias{left_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{left_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. 
+ +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. 
+Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affect by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched of \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. 
+\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. +} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. 
+ +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. +\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. +} + +} + +\examples{ +data(se_mini) +annotation <- se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> left_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/mutate-methods.Rd b/man/mutate-methods.Rd deleted file mode 100644 index cc79af02..00000000 --- a/man/mutate-methods.Rd +++ /dev/null @@ -1,103 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{mutate} -\alias{mutate} -\title{Create, modify, and delete columns} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs. - The name gives the name of the column in the output. - - The value can be: - - * A vector of length 1, which will be recycled to the correct length. - * A vector the same length as the current group (or the whole data frame - if ungrouped). - * `NULL`, to remove the column. - * A data frame or tibble, to create multiple columns in the output.} -} -\value{ -An object of the same type as `.data`. - -For `mutate()`: - -* Rows are not affected. -* Existing columns will be preserved unless explicitly modified. 
-* New columns will be added to the right of existing columns. -* Columns given value `NULL` will be removed -* Groups will be recomputed if a grouping variable is mutated. -* Data frame attributes are preserved. - -For `transmute()`: - -* Rows are not affected. -* Apart from grouping variables, existing columns will be remove unless - explicitly kept. -* Column order matches order of expressions. -* Groups will be recomputed if a grouping variable is mutated. -* Data frame attributes are preserved. -} -\description{ -`mutate()` adds new variables and preserves existing ones; -`transmute()` adds new variables and drops existing ones. -New variables overwrite existing variables of the same name. -Variables can be removed by setting their value to `NULL`. -} -\section{Useful mutate functions}{ - - -* [`+`], [`-`], [log()], etc., for their usual mathematical meanings - -* [lead()], [lag()] - -* [dense_rank()], [min_rank()], [percent_rank()], [row_number()], - [cume_dist()], [ntile()] - -* [cumsum()], [cummean()], [cummin()], [cummax()], [cumany()], [cumall()] - -* [na_if()], [coalesce()] - -* [if_else()], [recode()], [case_when()] -} - -\section{Grouped tibbles}{ - - -Because mutating expressions are computed within groups, they may -yield different results on grouped tibbles. This will be the case -as soon as an aggregating, lagging, or ranking function is -involved. Compare this ungrouped mutate: - -With the grouped equivalent: - -The former normalises `mass` by the global average whereas the -latter normalises by the averages within gender levels. -} - -\section{Methods}{ - -These function are **generic**s, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. 
- -Methods available in currently loaded packages: -} - -\examples{ - -# Newly created variables are available immediately -mtcars |> as_tibble() |> mutate( - cyl2 = cyl * 2, - cyl4 = cyl2 * 2 -) - -} -\seealso{ -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{filter}()}, -\code{\link{rename}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/mutate.Rd b/man/mutate.Rd new file mode 100644 index 00000000..b2b48431 --- /dev/null +++ b/man/mutate.Rd @@ -0,0 +1,173 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{mutate} +\alias{mutate} +\alias{mutate.tidybulk} +\title{Create, modify, and delete columns} +\usage{ +\method{mutate}{tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs. +The name gives the name of the column in the output. + +The value can be: +\itemize{ +\item A vector of length 1, which will be recycled to the correct length. +\item A vector the same length as the current group (or the whole data frame +if ungrouped). +\item \code{NULL}, to remove the column. +\item A data frame or tibble, to create multiple columns in the output. +}} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Columns from \code{.data} will be preserved according to the \code{.keep} argument. +\item Existing columns that are modified by \code{...} will always be returned in +their original location. +\item New columns created through \code{...} will be placed according to the +\code{.before} and \code{.after} arguments. +\item The number of rows is not affected. +\item Columns given the value \code{NULL} will be removed. 
+\item Groups will be recomputed if a grouping variable is mutated. +\item Data frame attributes are preserved. +} +} +\description{ +\code{mutate()} creates new columns that are functions of existing variables. +It can also modify (if the name is the same as an existing +column) and delete columns (by setting their value to \code{NULL}). +} +\section{Useful mutate functions}{ + + +\itemize{ +\item \code{\link{+}}, \code{\link{-}}, \code{\link[=log]{log()}}, etc., for their usual mathematical meanings +\item \code{\link[dplyr:lead]{lead()}}, \code{\link[dplyr:lag]{lag()}} +\item \code{\link[dplyr:dense_rank]{dense_rank()}}, \code{\link[dplyr:min_rank]{min_rank()}}, \code{\link[dplyr:percent_rank]{percent_rank()}}, \code{\link[dplyr:row_number]{row_number()}}, +\code{\link[dplyr:cume_dist]{cume_dist()}}, \code{\link[dplyr:ntile]{ntile()}} +\item \code{\link[=cumsum]{cumsum()}}, \code{\link[dplyr:cummean]{cummean()}}, \code{\link[=cummin]{cummin()}}, \code{\link[=cummax]{cummax()}}, \code{\link[dplyr:cumany]{cumany()}}, \code{\link[dplyr:cumall]{cumall()}} +\item \code{\link[dplyr:na_if]{na_if()}}, \code{\link[dplyr:coalesce]{coalesce()}} +\item \code{\link[dplyr:if_else]{if_else()}}, \code{\link[dplyr:recode]{recode()}}, \code{\link[dplyr:case_when]{case_when()}} +} + +} + +\section{Grouped tibbles}{ + + + +Because mutating expressions are computed within groups, they may +yield different results on grouped tibbles. This will be the case +as soon as an aggregating, lagging, or ranking function is +involved. Compare this ungrouped mutate: + +\if{html}{\out{
}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{
}} + +With the grouped equivalent: + +\if{html}{\out{
}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{
}} + +The former normalises \code{mass} by the global average whereas the +latter normalises by the averages within species levels. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. + +} + +\examples{ +# Newly created variables are available immediately +starwars \%>\% + select(name, mass) \%>\% + mutate( + mass2 = mass * 2, + mass2_squared = mass2 * mass2 + ) + +# As well as adding new variables, you can use mutate() to +# remove variables and modify existing variables. +starwars \%>\% + select(name, height, mass, homeworld) \%>\% + mutate( + mass = NULL, + height = height * 0.0328084 # convert to feet + ) + +# Use across() with mutate() to apply a transformation +# to multiple columns in a tibble. +starwars \%>\% + select(name, homeworld, species) \%>\% + mutate(across(!name, as.factor)) +# see more in ?across + +# Window functions are useful for grouped mutates: +starwars \%>\% + select(name, mass, homeworld) \%>\% + group_by(homeworld) \%>\% + mutate(rank = min_rank(desc(mass))) +# see `vignette("window-functions")` for more details + +# By default, new columns are placed on the far right. +df <- tibble(x = 1, y = 2) +df \%>\% mutate(z = x + y) +df \%>\% mutate(z = x + y, .before = 1) +df \%>\% mutate(z = x + y, .after = x) + +# By default, mutate() keeps all columns from the input data. 
+df <- tibble(x = 1, y = 2, a = "a", b = "b") +df \%>\% mutate(z = x + y, .keep = "all") # the default +df \%>\% mutate(z = x + y, .keep = "used") +df \%>\% mutate(z = x + y, .keep = "unused") +df \%>\% mutate(z = x + y, .keep = "none") + +# Grouping ---------------------------------------- +# The mutate operation may yield different results on grouped +# tibbles because the expressions are computed within groups. +# The following normalises `mass` by the global average: +starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Whereas this normalises `mass` by the averages within species +# levels: +starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Indirection ---------------------------------------- +# Refer to column names stored as strings with the `.data` pronoun: +vars <- c("mass", "height") +mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) +# Learn more in ?rlang::args_data_masking +} +\seealso{ +Other single table verbs: +\code{\link{arrange}()}, +\code{\link{rename}()}, +\code{\link{summarise}()} +} +\concept{single table verbs} diff --git a/man/mutate.nested_tidybulk.Rd b/man/mutate.nested_tidybulk.Rd new file mode 100644 index 00000000..f388c83b --- /dev/null +++ b/man/mutate.nested_tidybulk.Rd @@ -0,0 +1,175 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{mutate.nested_tidybulk} +\alias{mutate.nested_tidybulk} +\title{Create, modify, and delete columns} +\usage{ +\method{mutate}{nested_tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs. +The name gives the name of the column in the output. 
+ +The value can be: +\itemize{ +\item A vector of length 1, which will be recycled to the correct length. +\item A vector the same length as the current group (or the whole data frame +if ungrouped). +\item \code{NULL}, to remove the column. +\item A data frame or tibble, to create multiple columns in the output. +}} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Columns from \code{.data} will be preserved according to the \code{.keep} argument. +\item Existing columns that are modified by \code{...} will always be returned in +their original location. +\item New columns created through \code{...} will be placed according to the +\code{.before} and \code{.after} arguments. +\item The number of rows is not affected. +\item Columns given the value \code{NULL} will be removed. +\item Groups will be recomputed if a grouping variable is mutated. +\item Data frame attributes are preserved. +} +} +\description{ +\code{mutate()} creates new columns that are functions of existing variables. +It can also modify (if the name is the same as an existing +column) and delete columns (by setting their value to \code{NULL}). 
+} +\section{Useful mutate functions}{ + + +\itemize{ +\item \code{\link{+}}, \code{\link{-}}, \code{\link[=log]{log()}}, etc., for their usual mathematical meanings +\item \code{\link[dplyr:lead]{lead()}}, \code{\link[dplyr:lag]{lag()}} +\item \code{\link[dplyr:dense_rank]{dense_rank()}}, \code{\link[dplyr:min_rank]{min_rank()}}, \code{\link[dplyr:percent_rank]{percent_rank()}}, \code{\link[dplyr:row_number]{row_number()}}, +\code{\link[dplyr:cume_dist]{cume_dist()}}, \code{\link[dplyr:ntile]{ntile()}} +\item \code{\link[=cumsum]{cumsum()}}, \code{\link[dplyr:cummean]{cummean()}}, \code{\link[=cummin]{cummin()}}, \code{\link[=cummax]{cummax()}}, \code{\link[dplyr:cumany]{cumany()}}, \code{\link[dplyr:cumall]{cumall()}} +\item \code{\link[dplyr:na_if]{na_if()}}, \code{\link[dplyr:coalesce]{coalesce()}} +\item \code{\link[dplyr:if_else]{if_else()}}, \code{\link[dplyr:recode]{recode()}}, \code{\link[dplyr:case_when]{case_when()}} +} + +} + +\section{Grouped tibbles}{ + + + +Because mutating expressions are computed within groups, they may +yield different results on grouped tibbles. This will be the case +as soon as an aggregating, lagging, or ranking function is +involved. Compare this ungrouped mutate: + +\if{html}{\out{
}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{
}} + +With the grouped equivalent: + +\if{html}{\out{
}}\preformatted{starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) +}\if{html}{\out{
}} + +The former normalises \code{mass} by the global average whereas the +latter normalises by the averages within species levels. + +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("mutate")}. + +} + +\examples{ +# Newly created variables are available immediately +starwars \%>\% + select(name, mass) \%>\% + mutate( + mass2 = mass * 2, + mass2_squared = mass2 * mass2 + ) + +# As well as adding new variables, you can use mutate() to +# remove variables and modify existing variables. +starwars \%>\% + select(name, height, mass, homeworld) \%>\% + mutate( + mass = NULL, + height = height * 0.0328084 # convert to feet + ) + +# Use across() with mutate() to apply a transformation +# to multiple columns in a tibble. +starwars \%>\% + select(name, homeworld, species) \%>\% + mutate(across(!name, as.factor)) +# see more in ?across + +# Window functions are useful for grouped mutates: +starwars \%>\% + select(name, mass, homeworld) \%>\% + group_by(homeworld) \%>\% + mutate(rank = min_rank(desc(mass))) +# see `vignette("window-functions")` for more details + +# By default, new columns are placed on the far right. +df <- tibble(x = 1, y = 2) +df \%>\% mutate(z = x + y) +df \%>\% mutate(z = x + y, .before = 1) +df \%>\% mutate(z = x + y, .after = x) + +# By default, mutate() keeps all columns from the input data. 
+df <- tibble(x = 1, y = 2, a = "a", b = "b") +df \%>\% mutate(z = x + y, .keep = "all") # the default +df \%>\% mutate(z = x + y, .keep = "used") +df \%>\% mutate(z = x + y, .keep = "unused") +df \%>\% mutate(z = x + y, .keep = "none") + +# Grouping ---------------------------------------- +# The mutate operation may yield different results on grouped +# tibbles because the expressions are computed within groups. +# The following normalises `mass` by the global average: +starwars \%>\% + select(name, mass, species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Whereas this normalises `mass` by the averages within species +# levels: +starwars \%>\% + select(name, mass, species) \%>\% + group_by(species) \%>\% + mutate(mass_norm = mass / mean(mass, na.rm = TRUE)) + +# Indirection ---------------------------------------- +# Refer to column names stored as strings with the `.data` pronoun: +vars <- c("mass", "height") +mutate(starwars, prod = .data[[vars[[1]]]] * .data[[vars[[2]]]]) +# Learn more in ?rlang::args_data_masking +} +\seealso{ +Other single table verbs: +\code{\link[dplyr]{arrange}()}, +\code{\link[dplyr]{filter}()}, +\code{\link[dplyr]{reframe}()}, +\code{\link[dplyr]{rename}()}, +\code{\link[dplyr]{select}()}, +\code{\link[dplyr]{slice}()}, +\code{\link[dplyr]{summarise}()} +} diff --git a/man/nest-methods.Rd b/man/nest-methods.Rd deleted file mode 100644 index 3e86e90d..00000000 --- a/man/nest-methods.Rd +++ /dev/null @@ -1,61 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tidyr_methods.R -\name{unnest} -\alias{unnest} -\alias{nest} -\title{unnest} -\arguments{ -\item{data}{A tbl. (See tidyr)} - -\item{cols}{<[`tidy-select`][tidyr_tidy_select]> Columns to unnest. -If you `unnest()` multiple columns, parallel entries must be of -compatibble sizes, i.e. 
they're either equal or length 1 (following the -standard tidyverse recycling rules).} - -\item{names_sep}{If `NULL`, the default, the names will be left - as is. In `nest()`, inner names will come from the former outer names; - in `unnest()`, the new outer names will come from the inner names. - - If a string, the inner and outer names will be used together. In `nest()`, - the names of the new outer columns will be formed by pasting together the - outer and the inner column names, separated by `names_sep`. In `unnest()`, - the new inner names will have the outer names (+ `names_sep`) automatically - stripped. This makes `names_sep` roughly symmetric between nesting and unnesting.} - -\item{keep_empty}{See tidyr::unnest} - -\item{names_repair}{See tidyr::unnest} - -\item{ptype}{See tidyr::unnest} - -\item{.drop}{See tidyr::unnest} - -\item{.id}{tidyr::unnest} - -\item{.sep}{tidyr::unnest} - -\item{.preserve}{See tidyr::unnest} - -\item{.data}{A tbl. (See tidyr)} - -\item{...}{Name-variable pairs of the form new_col = c(col1, col2, col3) (See tidyr)} -} -\value{ -A tidySummarizedExperiment objector a tibble depending on input - -A tt object -} -\description{ -unnest - -nest -} -\examples{ - - -tidybulk::se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) - - -tidybulk::se_mini \%>\% tidybulk() \%>\% nest( data = -.feature) - -} diff --git a/man/nest.Rd b/man/nest.Rd new file mode 100644 index 00000000..d244d59a --- /dev/null +++ b/man/nest.Rd @@ -0,0 +1,84 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyr_methods.R +\name{nest} +\alias{nest} +\alias{nest.tidybulk} +\title{Nest rows into a list-column of data frames} +\usage{ +\method{nest}{tidybulk}(.data, ..., .names_sep = NULL) +} +\arguments{ +\item{.data}{A data frame.} + +\item{...}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> Columns to nest; these will +appear in the inner data frames. 
+ +Specified using name-variable pairs of the form +\code{new_col = c(col1, col2, col3)}. The right hand side can be any valid +tidyselect expression. + +If not supplied, then \code{...} is derived as all columns \emph{not} selected by +\code{.by}, and will use the column name from \code{.key}. + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +previously you could write \code{df \%>\% nest(x, y, z)}. +Convert to \code{df \%>\% nest(data = c(x, y, z))}.} + +\item{.names_sep}{If \code{NULL}, the default, the inner names will come from +the former outer names. If a string, the new inner names will use the +outer names with \code{names_sep} automatically stripped. This makes +\code{names_sep} roughly symmetric between nesting and unnesting.} +} +\description{ +Nesting creates a list-column of data frames; unnesting flattens it back out +into regular columns. Nesting is implicitly a summarising operation: you +get one row for each group defined by the non-nested columns. This is useful +in conjunction with other summaries that work with whole datasets, most +notably models. + +Learn more in \code{vignette("nest")}. +} +\details{ +If neither \code{...} nor \code{.by} are supplied, \code{nest()} will nest all variables, +and will use the column name supplied through \code{.key}. +} +\section{New syntax}{ + + +tidyr 1.0.0 introduced a new syntax for \code{nest()} and \code{unnest()} that's +designed to be more similar to other functions. Converting to the new syntax +should be straightforward (guided by the message you'll receive) but if +you just need to run an old analysis, you can easily revert to the previous +behaviour using \code{\link[tidyr:nest_legacy]{nest_legacy()}} and \code{\link[tidyr:unnest_legacy]{unnest_legacy()}} as follows: + +\if{html}{\out{
}}\preformatted{library(tidyr) +nest <- nest_legacy +unnest <- unnest_legacy +}\if{html}{\out{
}} + +} + +\section{Grouped data frames}{ + + +\code{df \%>\% nest(data = c(x, y))} specifies the columns to be nested; i.e. the +columns that will appear in the inner data frame. \code{df \%>\% nest(.by = c(x, y))} specifies the columns to nest \emph{by}; i.e. the columns that will remain in +the outer data frame. An alternative way to achieve the latter is to \code{nest()} +a grouped data frame created by \code{\link[dplyr:group_by]{dplyr::group_by()}}. The grouping variables +remain in the outer data frame and the others are nested. The result +preserves the grouping of the input. + +Variables supplied to \code{nest()} will override grouping variables so that +\code{df \%>\% group_by(x, y) \%>\% nest(data = !z)} will be equivalent to +\code{df \%>\% nest(data = !z)}. + +You can't supply \code{.by} with a grouped data frame, as the groups already +represent what you are nesting by. + +} + +\examples{ +data(se_mini) +se_mini \%>\% tidybulk() \%>\% nest(data = -.feature) + +} diff --git a/man/pivot_sample-methods.Rd b/man/pivot_sample-methods.Rd index d23701ad..035a1492 100644 --- a/man/pivot_sample-methods.Rd +++ b/man/pivot_sample-methods.Rd @@ -23,7 +23,9 @@ pivot_sample(.data, .sample = NULL) \S4method{pivot_sample}{RangedSummarizedExperiment}(.data, .sample = NULL) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} } @@ -35,12 +37,17 @@ A consistent object (to the input) A consistent object (to the input) } \description{ -pivot_sample() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or 
`SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with only sample-related columns +pivot_sample() takes as input a `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a `tbl` with only sample-related columns } \details{ `r lifecycle::badge("maturing")` -This functon extracts only sample-related information for downstream analysis (e.g., visualisation). It is disruptive in the sense that it cannot be passed anymore to tidybulk function. +This function extracts only sample-related information for +downstream analysis (e.g., visualisation). It is disruptive in the sense +that it cannot be passed anymore to tidybulk function. } \examples{ diff --git a/man/pivot_transcript-methods.Rd b/man/pivot_transcript-methods.Rd index 98d7c70d..cfcb0675 100644 --- a/man/pivot_transcript-methods.Rd +++ b/man/pivot_transcript-methods.Rd @@ -23,7 +23,9 @@ pivot_transcript(.data, .transcript = NULL) \S4method{pivot_transcript}{RangedSummarizedExperiment}(.data, .transcript = NULL) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.transcript}{The name of the transcript column} } @@ -35,12 +37,18 @@ A consistent object (to the input) A consistent object (to the input) } \description{ -pivot_transcript() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with
library(tidySummarizedExperiment)) and returns a `tbl` with only transcript-related columns +pivot_transcript() takes as input a `tbl` (with at least three +columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a `tbl` with only +transcript-related columns } \details{ `r lifecycle::badge("maturing")` -This functon extracts only transcript-related information for downstream analysis (e.g., visualisation). It is disruptive in the sense that it cannot be passed anymore to tidybulk function. +This functon extracts only transcript-related information for +downstream analysis (e.g., visualisation). It is disruptive in the sense +that it cannot be passed anymore to tidybulk function. } \examples{ diff --git a/man/quantile_normalise_abundance-methods.Rd b/man/quantile_normalise_abundance-methods.Rd index d7938677..b5f04566 100644 --- a/man/quantile_normalise_abundance-methods.Rd +++ b/man/quantile_normalise_abundance-methods.Rd @@ -71,7 +71,10 @@ quantile_normalise_abundance( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, +feature and transcript abundance) or `SummarizedExperiment` +(more convenient if abstracted to tibble with +library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -79,32 +82,59 @@ quantile_normalise_abundance( \item{.abundance}{The name of the transcript/gene abundance column} +<<<<<<< HEAD \item{method}{A character string. Either "limma_normalize_quantiles" for limma::normalizeQuantiles or "preprocesscore_normalize_quantiles_use_target" for preprocessCore::normalize.quantiles.use.target for large-scale datasets.} \item{target_distribution}{A numeric vector. 
If NULL the target distribution will be calculated by preprocessCore. This argument only affects the "preprocesscore_normalize_quantiles_use_target" method.} - -\item{action}{A character string between "add" (default) and "only". "add" joins the new information to the input tbl (default), "only" return a non-redundant tbl with the just new information.} +======= +\item{method}{A character string. Either "limma_normalize_quantiles" +for limma::normalizeQuantiles or +"preprocesscore_normalize_quantiles_use_target" for +preprocessCore::normalize.quantiles.use.target for large-scale datasets, +where limma may not be compatible.} +>>>>>>> chilampoon-improve-documentation + +\item{action}{A character string between "add" (default) and "only". +"add" joins the new information to the input tbl (default), +"only" returns a non-redundant tbl with just the new information.} } \value{ -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data as + `_scaled` -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data +as `_scaled` -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data + as `_scaled` -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data +as `_scaled` A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -quantile_normalise_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). 
+quantile_normalise_abundance() takes as input A `tbl` + (with at least three columns for sample, feature and transcript abundance) + or `SummarizedExperiment` (more convenient if abstracted to tibble with + library(tidySummarizedExperiment)) and scales transcript abundance + compensating for sequencing depth (e.g., with TMM algorithm, Robinson + and Oshlack doi.org/10.1186/gb-2010-11-3-r25). } \details{ `r lifecycle::badge("maturing")` +<<<<<<< HEAD Tranform the feature abundance across samples so to have the same quantile distribution (using preprocessCore). +======= +Scales transcript abundance compensating for sequencing depth +(e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). +Lowly transcribed transcripts/genes (defined with minimum_counts and +minimum_proportion parameters) are filtered out from the scaling procedure. +The scaling inference is then applied back to all unfiltered data. +>>>>>>> chilampoon-improve-documentation Underlying method diff --git a/man/reduce_dimensions-methods.Rd b/man/reduce_dimensions-methods.Rd index b0c60955..ac87bf96 100644 --- a/man/reduce_dimensions-methods.Rd +++ b/man/reduce_dimensions-methods.Rd @@ -107,31 +107,45 @@ reduce_dimensions( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, +feature and transcript abundance) or `SummarizedExperiment` (more +convenient if abstracted to tibble with library(tidySummarizedExperiment))} \item{.element}{The name of the element column (normally samples).} \item{.feature}{The name of the feature column (normally transcripts/genes)} -\item{.abundance}{The name of the column including the numerical value the clustering is based on (normally transcript abundance)} +\item{.abundance}{The name of the column including the 
numerical value +the clustering is based on (normally transcript abundance)} -\item{method}{A character string. The dimension reduction algorithm to use (PCA, MDS, tSNE).} +\item{method}{A character string. The dimension reduction algorithm to +use (PCA, MDS, tSNE).} -\item{.dims}{An integer. The number of dimensions your are interested in (e.g., 4 for returning the first four principal components).} +\item{.dims}{An integer. The number of dimensions you are interested in +(e.g., 4 for returning the first four principal components).} \item{top}{An integer. How many top genes to select for dimensionality reduction} -\item{of_samples}{A boolean. In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column} +\item{of_samples}{A boolean. In case the input is a tidybulk object, +it indicates whether the element column will be sample or transcript column} -\item{transform}{A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} +\item{transform}{A function that will transform the counts, by default +it is log1p for RNA sequencing data, but for avoiding transformation you +can use identity} -\item{scale}{A boolean for method="PCA", this will be passed to the `prcomp` function. It is not included in the ... argument because although the default for `prcomp` if FALSE, it is advisable to set it as TRUE.} +\item{scale}{A boolean for method="PCA", this will be passed to the `prcomp` +function. It is not included in the ... argument because although the +default for `prcomp` is FALSE, it is advisable to set it as TRUE.} -\item{action}{A character string. 
Whether to join the new information to +the input tbl (add), or just get the non-redundant tbl with the new +information (get).} -\item{...}{Further parameters passed to the function prcomp if you choose method="PCA" or Rtsne if you choose method="tSNE"} +\item{...}{Further parameters passed to the function prcomp if you choose +method="PCA" or Rtsne if you choose method="tSNE"} -\item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{log_transform}{DEPRECATED - A boolean, whether the value should be +log-transformed (e.g., TRUE for RNA sequencing data)} } \value{ A tbl object with additional columns for the reduced dimensions @@ -147,7 +161,11 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -reduce_dimensions() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and calculates the reduced dimensional space of the transcript abundance. +reduce_dimensions() takes as input A `tbl` (with at least + three columns for sample, feature and transcript abundance) or + `SummarizedExperiment` (more convenient if abstracted to tibble with + library(tidySummarizedExperiment)) and calculates the reduced dimensional + space of the transcript abundance. 
} \details{ `r lifecycle::badge("maturing")` diff --git a/man/reexports.Rd b/man/reexports.Rd index 8bc39e1e..10ab6d8b 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -3,8 +3,8 @@ \docType{import} \name{reexports} \alias{reexports} -\alias{select} \alias{do} +\alias{select} \alias{tibble} \alias{as_tibble} \title{Objects exported from other packages} diff --git a/man/remove_redundancy-methods.Rd b/man/remove_redundancy-methods.Rd index 58a05cf1..a55f3d84 100644 --- a/man/remove_redundancy-methods.Rd +++ b/man/remove_redundancy-methods.Rd @@ -8,7 +8,8 @@ \alias{remove_redundancy,tidybulk-method} \alias{remove_redundancy,SummarizedExperiment-method} \alias{remove_redundancy,RangedSummarizedExperiment-method} -\title{Drop redundant elements (e.g., samples) for which feature (e.g., transcript/gene) abundances are correlated} +\title{Drop redundant elements (e.g., samples) for which feature (e.g., +transcript/gene) abundances are correlated} \usage{ remove_redundancy( .data, @@ -101,29 +102,40 @@ remove_redundancy( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.element}{The name of the element column (normally samples).} \item{.feature}{The name of the feature column (normally transcripts/genes)} -\item{.abundance}{The name of the column including the numerical value the clustering is based on (normally transcript abundance)} +\item{.abundance}{The name of the column including the numerical value the +clustering is based on (normally transcript abundance)} -\item{method}{A character string. The method to use, correlation and reduced_dimensions are available. 
The latter eliminates one of the most proximar pairs of samples in PCA reduced dimensions.} +\item{method}{A character string. The method to use, correlation and +reduced_dimensions are available. The latter eliminates one of the most +proximal pairs of samples in PCA reduced dimensions.} -\item{of_samples}{A boolean. In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column} +\item{of_samples}{A boolean. In case the input is a tidybulk object, +it indicates whether the element column will be sample or transcript column} \item{correlation_threshold}{A real number between 0 and 1. For correlation based calculation.} \item{top}{An integer. How many top genes to select for correlation based method} -\item{transform}{A function that will tranform the counts, by default it is log1p for RNA sequencing data, but for avoinding tranformation you can use identity} +\item{transform}{A function that will transform the counts, by default it +is log1p for RNA sequencing data, but for avoiding transformation you +can use identity} -\item{Dim_a_column}{A character string. For reduced_dimension based calculation. The column of one principal component} +\item{Dim_a_column}{A character string. For reduced_dimension based +calculation. The column of one principal component} -\item{Dim_b_column}{A character string. For reduced_dimension based calculation. The column of another principal component} +\item{Dim_b_column}{A character string. For reduced_dimension based +calculation. The column of another principal component} -\item{log_transform}{DEPRECATED - A boolean, whether the value should be log-transformed (e.g., TRUE for RNA sequencing data)} +\item{log_transform}{DEPRECATED - A boolean, whether the value +should be log-transformed (e.g., TRUE for RNA sequencing data)} } \value{ A tbl object with with dropped redundant elements (e.g., samples). 
@@ -139,20 +151,30 @@ A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -remove_redundancy() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) for correlation method or | | | <...> | for reduced_dimensions method, and returns a consistent object (to the input) with dropped elements (e.g., samples). +remove_redundancy() takes as input A `tbl` (with at least + three columns for sample, feature and transcript abundance) or + `SummarizedExperiment` (more convenient if abstracted to tibble with + library(tidySummarizedExperiment)) for correlation method or | + | | <...> | for reduced_dimensions method, + and returns a consistent object (to the input) with dropped + elements (e.g., samples). } \details{ `r lifecycle::badge("maturing")` -This function removes redundant elements from the original data set (e.g., samples or transcripts). -For example, if we want to define cell-type specific signatures with low sample redundancy. +This function removes redundant elements from the original + data set (e.g., samples or transcripts). +For example, if we want to define cell-type specific signatures with +low sample redundancy. This function returns a tibble with dropped redundant elements (e.g., samples). Two redundancy estimation approaches are supported: -(i) removal of highly correlated clusters of elements (keeping a representative) with method="correlation"; +(i) removal of highly correlated clusters of elements (keeping a +representative) with method="correlation"; (ii) removal of most proximal element pairs in a reduced dimensional space. 
Underlying method for correlation: -widyr::pairwise_cor(sample, transcript,count, sort = TRUE, diag = FALSE, upper = FALSE) +widyr::pairwise_cor(sample, transcript,count, sort = TRUE, + diag = FALSE, upper = FALSE) Underlying custom method for reduced dimensions: select_closest_pairs = function(df) { @@ -169,14 +191,10 @@ select_closest_pairs = function(df) { !`sample 2` %in% (pair |> select(1:2) |> as.character()) ) } - couples - } } \examples{ - - tidybulk::se_mini |> identify_abundant() |> remove_redundancy( diff --git a/man/rename-methods.Rd b/man/rename-methods.Rd deleted file mode 100644 index db4defc0..00000000 --- a/man/rename-methods.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{rename} -\alias{rename} -\title{Rename columns} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-select`][dplyr_tidy_select]> Use `new_name = old_name` -to rename selected variables.} -} -\value{ -An object of the same type as `.data`. -* Rows are not affected. -* Column names are changed; column order is preserved -* Data frame attributes are preserved. -* Groups are updated to reflect new names. -} -\description{ -Rename individual variables using `new_name = old_name` syntax. -} -\section{Scoped selection and renaming}{ - - -Use the three scoped variants ([rename_all()], [rename_if()], [rename_at()]) -to renaming a set of variables with a function. -} - -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. 
- -The following methods are currently available in loaded packages: -} - -\examples{ - -iris <- as_tibble(iris) # so it prints a little nicer -rename(iris, petal_length = Petal.Length) - -} -\seealso{ -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{filter}()}, -\code{\link{mutate}()}, -\code{\link{summarise}()} -} -\concept{single table verbs} diff --git a/man/rename.Rd b/man/rename.Rd new file mode 100644 index 00000000..3d593bda --- /dev/null +++ b/man/rename.Rd @@ -0,0 +1,87 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{rename} +\alias{rename} +\alias{rename.tidybulk} +\title{Rename columns} +\usage{ +\method{rename}{tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{For \code{rename()}: <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Use +\code{new_name = old_name} to rename selected variables. + +For \code{rename_with()}: additional arguments passed onto \code{.fn}.} +} +\value{ +An object of the same type as \code{.data}. The output has the following +properties: +\itemize{ +\item Rows are not affected. +\item Column names are changed; column order is preserved. +\item Data frame attributes are preserved. +\item Groups are updated to reflect new names. +} +} +\description{ +\code{rename()} changes the names of individual variables using +\code{new_name = old_name} syntax; \code{rename_with()} renames columns using a +function. +} +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("rename")}. 
+ +} + +\examples{ +iris <- as_tibble(iris) # so it prints a little nicer +rename(iris, petal_length = Petal.Length) + +# Rename using a named vector and `all_of()` +lookup <- c(pl = "Petal.Length", sl = "Sepal.Length") +rename(iris, all_of(lookup)) + +# If your named vector might contain names that don't exist in the data, +# use `any_of()` instead +lookup <- c(lookup, new = "unknown") +try(rename(iris, all_of(lookup))) +rename(iris, any_of(lookup)) + +rename_with(iris, toupper) +rename_with(iris, toupper, starts_with("Petal")) +rename_with(iris, ~ tolower(gsub(".", "_", .x, fixed = TRUE))) + +\dontshow{if (getRversion() > "4.0.1") (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# If your renaming function uses `paste0()`, make sure to set +# `recycle0 = TRUE` to ensure that empty selections are recycled correctly +try(rename_with( + iris, + ~ paste0("prefix_", .x), + starts_with("nonexistent") +)) + +rename_with( + iris, + ~ paste0("prefix_", .x, recycle0 = TRUE), + starts_with("nonexistent") +) +\dontshow{\}) # examplesIf} +} +\seealso{ +Other single table verbs: +\code{\link{arrange}()}, +\code{\link{mutate}()}, +\code{\link{summarise}()} +} +\concept{single table verbs} diff --git a/man/right_join.Rd b/man/right_join.Rd new file mode 100644 index 00000000..88594f4f --- /dev/null +++ b/man/right_join.Rd @@ -0,0 +1,172 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{right_join} +\alias{right_join} +\alias{right_join.tidybulk} +\title{Mutating joins} +\usage{ +\method{right_join}{tidybulk}(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) +} +\arguments{ +\item{x, y}{A pair of data frames, data frame extensions (e.g. a tibble), or +lazy data frames (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{by}{A join specification created with \code{\link[dplyr:join_by]{join_by()}}, or a character +vector of variables to join by. 
+ +If \code{NULL}, the default, \verb{*_join()} will perform a natural join, using all +variables in common across \code{x} and \code{y}. A message lists the variables so +that you can check they're correct; suppress the message by supplying \code{by} +explicitly. + +To join on different variables between \code{x} and \code{y}, use a \code{\link[dplyr:join_by]{join_by()}} +specification. For example, \code{join_by(a == b)} will match \code{x$a} to \code{y$b}. + +To join by multiple variables, use a \code{\link[dplyr:join_by]{join_by()}} specification with +multiple expressions. For example, \code{join_by(a == b, c == d)} will match +\code{x$a} to \code{y$b} and \code{x$c} to \code{y$d}. If the column names are the same between +\code{x} and \code{y}, you can shorten this by listing only the variable names, like +\code{join_by(a, c)}. + +\code{\link[dplyr:join_by]{join_by()}} can also be used to perform inequality, rolling, and overlap +joins. See the documentation at \link[dplyr:join_by]{?join_by} for details on +these types of joins. + +For simple equality joins, you can alternatively specify a character vector +of variable names to join by. For example, \code{by = c("a", "b")} joins \code{x$a} +to \code{y$a} and \code{x$b} to \code{y$b}. If variable names differ between \code{x} and \code{y}, +use a named character vector like \code{by = c("x_a" = "y_a", "x_b" = "y_b")}. + +To perform a cross-join, generating all combinations of \code{x} and \code{y}, see +\code{\link[dplyr:cross_join]{cross_join()}}.} + +\item{copy}{If \code{x} and \code{y} are not from the same data source, +and \code{copy} is \code{TRUE}, then \code{y} will be copied into the +same src as \code{x}. This allows you to join tables across srcs, but +it is a potentially expensive operation so you must opt into it.} + +\item{suffix}{If there are non-joined duplicate variables in \code{x} and +\code{y}, these suffixes will be added to the output to disambiguate them. 
+Should be a character vector of length 2.} + +\item{...}{Other parameters passed onto methods.} +} +\value{ +An object of the same type as \code{x} (including the same groups). The order of +the rows and columns of \code{x} is preserved as much as possible. The output has +the following properties: +\itemize{ +\item The rows are affect by the join type. +\itemize{ +\item \code{inner_join()} returns matched \code{x} rows. +\item \code{left_join()} returns all \code{x} rows. +\item \code{right_join()} returns matched of \code{x} rows, followed by unmatched \code{y} rows. +\item \code{full_join()} returns all \code{x} rows, followed by unmatched \code{y} rows. +} +\item Output columns include all columns from \code{x} and all non-key columns from +\code{y}. If \code{keep = TRUE}, the key columns from \code{y} are included as well. +\item If non-key columns in \code{x} and \code{y} have the same name, \code{suffix}es are added +to disambiguate. If \code{keep = TRUE} and key columns in \code{x} and \code{y} have +the same name, \code{suffix}es are added to disambiguate these as well. +\item If \code{keep = FALSE}, output columns included in \code{by} are coerced to their +common type between \code{x} and \code{y}. +} +} +\description{ +Mutating joins add columns from \code{y} to \code{x}, matching observations based on +the keys. There are four mutating joins: the inner join, and the three outer +joins. +\subsection{Inner join}{ + +An \code{inner_join()} only keeps observations from \code{x} that have a matching key +in \code{y}. + +The most important property of an inner join is that unmatched rows in either +input are not included in the result. This means that generally inner joins +are not appropriate in most analyses, because it is too easy to lose +observations. +} + +\subsection{Outer joins}{ + +The three outer joins keep observations that appear in at least one of the +data frames: +\itemize{ +\item A \code{left_join()} keeps all observations in \code{x}. 
+\item A \code{right_join()} keeps all observations in \code{y}. +\item A \code{full_join()} keeps all observations in \code{x} and \code{y}. +} +} +} +\section{Many-to-many relationships}{ + + + +By default, dplyr guards against many-to-many relationships in equality joins +by throwing a warning. These occur when both of the following are true: +\itemize{ +\item A row in \code{x} matches multiple rows in \code{y}. +\item A row in \code{y} matches multiple rows in \code{x}. +} + +This is typically surprising, as most joins involve a relationship of +one-to-one, one-to-many, or many-to-one, and is often the result of an +improperly specified join. Many-to-many relationships are particularly +problematic because they can result in a Cartesian explosion of the number of +rows returned from the join. + +If a many-to-many relationship is expected, silence this warning by +explicitly setting \code{relationship = "many-to-many"}. + +In production code, it is best to preemptively set \code{relationship} to whatever +relationship you expect to exist between the keys of \code{x} and \code{y}, as this +forces an error to occur immediately if the data doesn't align with your +expectations. + +Inequality joins typically result in many-to-many relationships by nature, so +they don't warn on them by default, but you should still take extra care when +specifying an inequality join, because they also have the capability to +return a large number of rows. + +Rolling joins don't warn on many-to-many relationships either, but many +rolling joins follow a many-to-one relationship, so it is often useful to +set \code{relationship = "many-to-one"} to enforce this. + +Note that in SQL, most database providers won't let you specify a +many-to-many relationship between two tables, instead requiring that you +create a third \emph{junction table} that results in two one-to-many relationships +instead. 
+ +} + +\section{Methods}{ + + +These functions are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{inner_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("inner_join")}. +\item \code{left_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("left_join")}. +\item \code{right_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("right_join")}. +\item \code{full_join()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("full_join")}. +} + +} + +\examples{ +data(se_mini) +annotation <- se_mini |> tidybulk() |> as_tibble() |> + distinct(.sample) |> mutate(source = "AU") +se_mini |> tidybulk() |> as_tibble() |> right_join(annotation) + +} +\seealso{ +Other joins: +\code{\link[dplyr]{cross_join}()}, +\code{\link[dplyr]{filter-joins}}, +\code{\link[dplyr]{nest_join}()} +} diff --git a/man/rotate_dimensions-methods.Rd b/man/rotate_dimensions-methods.Rd index c87c1cca..6d018171 100644 --- a/man/rotate_dimensions-methods.Rd +++ b/man/rotate_dimensions-methods.Rd @@ -83,7 +83,9 @@ rotate_dimensions( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{dimension_1_column}{A character string. The column of the dimension 1} @@ -93,29 +95,48 @@ rotate_dimensions( \item{.element}{The name of the element column (normally samples).} -\item{of_samples}{A boolean. 
In case the input is a tidybulk object, it indicates Whether the element column will be sample or transcript column} +\item{of_samples}{A boolean. In case the input is a tidybulk object, +it indicates Whether the element column will be sample or transcript column} -\item{dimension_1_column_rotated}{A character string. The column of the rotated dimension 1 (optional)} +\item{dimension_1_column_rotated}{A character string. The column of the +rotated dimension 1 (optional)} -\item{dimension_2_column_rotated}{A character string. The column of the rotated dimension 2 (optional)} +\item{dimension_2_column_rotated}{A character string. The column of the +rotated dimension 2 (optional)} -\item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{action}{A character string. Whether to join the new information to the +input tbl (add), or just get the non-redundant tbl with the new information (get).} } \value{ -A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. - -A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. - -A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. - -A tbl object with additional columns for the reduced dimensions. additional columns for the rotated dimensions. The rotated dimensions will be added to the original data set as ` rotated ` by default, or as specified in the input arguments. 
+A tbl object with additional columns for the reduced dimensions. + additional columns for the rotated dimensions. The rotated dimensions will + be added to the original data set as ` rotated ` + by default, or as specified in the input arguments. + +A tbl object with additional columns for the reduced dimensions. + additional columns for the rotated dimensions. The rotated dimensions will + be added to the original data set as ` rotated ` + by default, or as specified in the input arguments. + +A tbl object with additional columns for the reduced dimensions. + additional columns for the rotated dimensions. The rotated dimensions will + be added to the original data set as ` rotated ` + by default, or as specified in the input arguments. + +A tbl object with additional columns for the reduced dimensions. + additional columns for the rotated dimensions. The rotated dimensions + will be added to the original data set as + ` rotated ` by default, or as + specified in the input arguments. A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -rotate_dimensions() takes as input a `tbl` formatted as | | | <...> | and calculates the rotated dimensional space of the transcript abundance. +rotate_dimensions() takes as input a `tbl` formatted as + | | | <...> | and calculates the rotated + dimensional space of the transcript abundance. 
} \details{ `r lifecycle::badge("maturing")` @@ -134,13 +155,12 @@ Underlying custom method: } } \examples{ - counts.MDS = tidybulk::se_mini |> identify_abundant() |> reduce_dimensions( method="MDS", .dims = 3) -counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, rotation_degrees = 45, .element = sample) - +counts.MDS.rotated = rotate_dimensions(counts.MDS, `Dim1`, `Dim2`, + rotation_degrees = 45, .element = sample) } diff --git a/man/rowwise-methods.Rd b/man/rowwise-methods.Rd deleted file mode 100644 index 3ff5fecf..00000000 --- a/man/rowwise-methods.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{rowwise} -\alias{rowwise} -\title{Group input by rows} -\arguments{ -\item{data}{Input data frame.} - -\item{...}{Variables to be preserved when calling summarise(). This is typically a set of variables whose combination uniquely identify each row. NB: unlike group_by() you can not create new variables here but instead you can select multiple variables with (e.g.) everything().} -} -\value{ -A consistent object (to the input) - - A `tbl` -} -\description{ -See [this repository](https://github.com/jennybc/row-oriented-workflows) -for alternative ways to perform row-wise operations. -} -\details{ -`rowwise()` is used for the results of [do()] when you -create list-variables. It is also useful to support arbitrary -complex operations that need to be applied to each row. - -Currently, rowwise grouping only works with data frames. Its -main impact is to allow you to work with list-variables in -[summarise()] and [mutate()] without having to -use \code{[[1]]}. This makes `summarise()` on a rowwise tbl -effectively equivalent to [plyr::ldply()]. 
-} -\examples{ - -df <- expand.grid(x = 1:3, y = 3:1) -df_done <- df |> rowwise() - -} diff --git a/man/rowwise.Rd b/man/rowwise.Rd new file mode 100644 index 00000000..28d150da --- /dev/null +++ b/man/rowwise.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{rowwise} +\alias{rowwise} +\alias{rowwise.tidybulk} +\title{Group input by rows} +\usage{ +\method{rowwise}{tidybulk}(data, ...) +} +\arguments{ +\item{data}{Input data frame.} + +\item{...}{<\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> Variables to be preserved +when calling \code{\link[dplyr:summarise]{summarise()}}. This is typically a set of variables whose +combination uniquely identify each row. + +\strong{NB}: unlike \code{group_by()} you can not create new variables here but +instead you can select multiple variables with (e.g.) \code{everything()}.} +} +\value{ +A row-wise data frame with class \code{rowwise_df}. Note that a +\code{rowwise_df} is implicitly grouped by row, but is not a \code{grouped_df}. +} +\description{ +\code{rowwise()} allows you to compute on a data frame a row-at-a-time. +This is most useful when a vectorised function doesn't exist. + +Most dplyr verbs preserve row-wise grouping. The exception is \code{\link[dplyr:summarise]{summarise()}}, +which return a \link[dplyr]{grouped_df}. You can explicitly ungroup with \code{\link[dplyr:ungroup]{ungroup()}} +or \code{\link[dplyr:as_tibble]{as_tibble()}}, or convert to a \link[dplyr]{grouped_df} with \code{\link[dplyr:group_by]{group_by()}}. +} +\section{List-columns}{ + + +Because a rowwise has exactly one row per group it offers a small +convenience for working with list-columns. Normally, \code{summarise()} and +\code{mutate()} extract a groups worth of data with \code{[}. But when you index +a list in this way, you get back another list. 
When you're working with +a \code{rowwise} tibble, then dplyr will use \code{[[} instead of \code{[} to make your +life a little easier. + +} + +\examples{ +df <- tibble(x = runif(6), y = runif(6), z = runif(6)) +# Compute the mean of x, y, z in each row +df \%>\% rowwise() \%>\% mutate(m = mean(c(x, y, z))) +# use c_across() to more easily select many variables +df \%>\% rowwise() \%>\% mutate(m = mean(c_across(x:z))) + +# Compute the minimum of x and y in each row +df \%>\% rowwise() \%>\% mutate(m = min(c(x, y, z))) +# In this case you can use an existing vectorised function: +df \%>\% mutate(m = pmin(x, y, z)) +# Where these functions exist they'll be much faster than rowwise +# so be on the lookout for them. + +# rowwise() is also useful when doing simulations +params <- tribble( + ~sim, ~n, ~mean, ~sd, + 1, 1, 1, 1, + 2, 2, 2, 4, + 3, 3, -1, 2 +) +# Here I supply variables to preserve after the computation +params \%>\% + rowwise(sim) \%>\% + reframe(z = rnorm(n, mean, sd)) + +# If you want one row per simulation, put the results in a list() +params \%>\% + rowwise(sim) \%>\% + summarise(z = list(rnorm(n, mean, sd)), .groups = "keep") +} +\seealso{ +\code{\link[dplyr:nest_by]{nest_by()}} for a convenient way of creating rowwise data frames +with nested data. 
+} diff --git a/man/scale_abundance-methods.Rd b/man/scale_abundance-methods.Rd index 21cd6476..42956d65 100644 --- a/man/scale_abundance-methods.Rd +++ b/man/scale_abundance-methods.Rd @@ -83,7 +83,10 @@ scale_abundance( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, +feature and transcript abundance) or `SummarizedExperiment` +(more convenient if abstracted to tibble with +library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -91,51 +94,61 @@ scale_abundance( \item{.abundance}{The name of the transcript/gene abundance column} -\item{method}{A character string. The scaling method passed to the back-end function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")} +\item{method}{A character string. The scaling method passed to the back-end +function (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile")} -\item{reference_sample}{A character string. The name of the reference sample. If NULL the sample with highest total read count will be selected as reference.} +\item{reference_sample}{A character string. The name of the reference sample. +If NULL the sample with highest total read count will be selected as reference.} -\item{.subset_for_scaling}{A gene-wise quosure condition. This will be used to filter rows (features/genes) of the dataset. For example} +\item{.subset_for_scaling}{A gene-wise quosure condition. This will be used +to filter rows (features/genes) of the dataset. For example} -\item{action}{A character string between "add" (default) and "only". "add" joins the new information to the input tbl (default), "only" return a non-redundant tbl with the just new information.} +\item{action}{A character string between "add" (default) and "only". 
+"add" joins the new information to the input tbl (default), "only" +return a non-redundant tbl with the just new information.} \item{reference_selection_function}{DEPRECATED. please use reference_sample.} } \value{ -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data +as `_scaled` -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data +as `_scaled` -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data +as `_scaled` -A tbl object with additional columns with scaled data as `_scaled` +A tbl object with additional columns with scaled data +as `_scaled` A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -scale_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and Scales transcript abundance compansating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). +scale_abundance() takes as input a `tbl` (with at least three + columns for sample, feature and transcript abundance) or + `SummarizedExperiment` (more convenient if abstracted to tibble with + library(tidySummarizedExperiment)) and scales transcript abundance + compensating for sequencing depth (e.g., with TMM algorithm, + Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). } \details{ `r lifecycle::badge("maturing")` Scales transcript abundance compensating for sequencing depth (e.g., with TMM algorithm, Robinson and Oshlack doi.org/10.1186/gb-2010-11-3-r25). -Lowly transcribed transcripts/genes (defined with minimum_counts and minimum_proportion parameters) -are filtered out from the scaling procedure. 
+Lowly transcribed transcripts/genes (defined with minimum_counts +and minimum_proportion parameters) are filtered out from the scaling procedure. The scaling inference is then applied back to all unfiltered data. Underlying method edgeR::calcNormFactors(.data, method = c("TMM","TMMwsp","RLE","upperquartile")) } \examples{ - - tidybulk::se_mini |> identify_abundant() |> scale_abundance() - - } diff --git a/man/summarise-methods.Rd b/man/summarise-methods.Rd deleted file mode 100644 index 56907289..00000000 --- a/man/summarise-methods.Rd +++ /dev/null @@ -1,91 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/dplyr_methods.R -\name{summarise} -\alias{summarise} -\title{Summarise each group to fewer rows} -\arguments{ -\item{.data}{A tbl. (See dplyr)} - -\item{...}{<[`tidy-eval`][dplyr_tidy_eval]> Name-value pairs of summary - functions. The name will be the name of the variable in the result. - - The value can be: - - * A vector of length 1, e.g. `min(x)`, `n()`, or `sum(is.na(y))`. - * A vector of length `n`, e.g. `quantile()`. - * A data frame, to add multiple columns from a single expression.} -} -\value{ -An object _usually_ of the same type as `.data`. - -* The rows come from the underlying `group_keys()`. -* The columns are a combination of the grouping keys and the summary - expressions that you provide. -* If `x` is grouped by more than one variable, the output will be another - [grouped_df] with the right-most group removed. -* If `x` is grouped by one variable, or is not grouped, the output will - be a [tibble]. -* Data frame attributes are **not** preserved, because `summarise()` - fundamentally creates a new data frame. -} -\description{ -`summarise()` creates a new data frame. It will have one (or more) rows for -each combination of grouping variables; if there are no grouping variables, -the output will have a single row summarising all observations in the input. 
-It will contain one column for each grouping variable and one column -for each of the summary statistics that you have specified. - -`summarise()` and `summarize()` are synonyms. -} -\section{Useful functions}{ - - -* Center: [mean()], [median()] -* Spread: [sd()], [IQR()], [mad()] -* Range: [min()], [max()], [quantile()] -* Position: [first()], [last()], [nth()], -* Count: [n()], [n_distinct()] -* Logical: [any()], [all()] -} - -\section{Backend variations}{ - - -The data frame backend supports creating a variable and using it in the -same summary. This means that previously created summary variables can be -further transformed or combined within the summary, as in [mutate()]. -However, it also means that summary variables with the same names as previous -variables overwrite them, making those variables unavailable to later summary -variables. - -This behaviour may not be supported in other backends. To avoid unexpected -results, consider using new names for your summary variables, especially when -creating multiple summaries. -} - -\section{Methods}{ - -This function is a **generic**, which means that packages can provide -implementations (methods) for other classes. See the documentation of -individual methods for extra arguments and differences in behaviour. 
- -The following methods are currently available in loaded packages: -} - -\examples{ - -# A summary applied to ungrouped tbl returns a single row - -mtcars |> - summarise(mean = mean(disp)) - - -} -\seealso{ -Other single table verbs: -\code{\link{arrange}()}, -\code{\link{filter}()}, -\code{\link{mutate}()}, -\code{\link{rename}()} -} -\concept{single table verbs} diff --git a/man/summarise.Rd b/man/summarise.Rd new file mode 100644 index 00000000..cf23fdc1 --- /dev/null +++ b/man/summarise.Rd @@ -0,0 +1,139 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{summarise} +\alias{summarise} +\alias{summarise.tidybulk} +\alias{summarize} +\alias{summarize.tidybulk} +\title{Summarise each group down to one row} +\usage{ +\method{summarise}{tidybulk}(.data, ...) + +\method{summarize}{tidybulk}(.data, ...) +} +\arguments{ +\item{.data}{A data frame, data frame extension (e.g. a tibble), or a +lazy data frame (e.g. from dbplyr or dtplyr). See \emph{Methods}, below, for +more details.} + +\item{...}{<\code{\link[rlang:args_data_masking]{data-masking}}> Name-value pairs of +summary functions. The name will be the name of the variable in the result. + +The value can be: +\itemize{ +\item A vector of length 1, e.g. \code{min(x)}, \code{n()}, or \code{sum(is.na(y))}. +\item A data frame, to add multiple columns from a single expression. +} + +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Returning values with size 0 or >1 was +deprecated as of 1.1.0. Please use \code{\link[dplyr:reframe]{reframe()}} for this instead.} +} +\value{ +An object \emph{usually} of the same type as \code{.data}. +\itemize{ +\item The rows come from the underlying \code{\link[dplyr:group_keys]{group_keys()}}. +\item The columns are a combination of the grouping keys and the summary +expressions that you provide. 
+\item The grouping structure is controlled by the \verb{.groups=} argument, the +output may be another \link[dplyr]{grouped_df}, a \link[dplyr]{tibble} or a \link[dplyr]{rowwise} data frame. +\item Data frame attributes are \strong{not} preserved, because \code{summarise()} +fundamentally creates a new data frame. +} +} +\description{ +\code{summarise()} creates a new data frame. It returns one row for each +combination of grouping variables; if there are no grouping variables, the +output will have a single row summarising all observations in the input. It +will contain one column for each grouping variable and one column for each of +the summary statistics that you have specified. + +\code{summarise()} and \code{summarize()} are synonyms. +} +\section{Useful functions}{ + + +\itemize{ +\item Center: \code{\link[=mean]{mean()}}, \code{\link[=median]{median()}} +\item Spread: \code{\link[=sd]{sd()}}, \code{\link[=IQR]{IQR()}}, \code{\link[=mad]{mad()}} +\item Range: \code{\link[=min]{min()}}, \code{\link[=max]{max()}}, +\item Position: \code{\link[dplyr:first]{first()}}, \code{\link[dplyr:last]{last()}}, \code{\link[dplyr:nth]{nth()}}, +\item Count: \code{\link[dplyr:n]{n()}}, \code{\link[dplyr:n_distinct]{n_distinct()}} +\item Logical: \code{\link[=any]{any()}}, \code{\link[=all]{all()}} +} + +} + +\section{Backend variations}{ + + + +The data frame backend supports creating a variable and using it in the +same summary. This means that previously created summary variables can be +further transformed or combined within the summary, as in \code{\link[dplyr:mutate]{mutate()}}. +However, it also means that summary variables with the same names as previous +variables overwrite them, making those variables unavailable to later summary +variables. + +This behaviour may not be supported in other backends. To avoid unexpected +results, consider using new names for your summary variables, especially when +creating multiple summaries. 
+ +} + +\section{Methods}{ + + +This function is a \strong{generic}, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +The following methods are currently available in loaded packages: +\Sexpr[stage=render,results=rd]{dplyr:::methods_rd("summarise")}. + +} + +\examples{ +# A summary applied to ungrouped tbl returns a single row +mtcars \%>\% + summarise(mean = mean(disp), n = n()) + +# Usually, you'll want to group first +mtcars \%>\% + group_by(cyl) \%>\% + summarise(mean = mean(disp), n = n()) + +# Each summary call removes one grouping level (since that group +# is now just a single row) +mtcars \%>\% + group_by(cyl, vs) \%>\% + summarise(cyl_n = n()) \%>\% + group_vars() + +# BEWARE: reusing variables may lead to unexpected results +mtcars \%>\% + group_by(cyl) \%>\% + summarise(disp = mean(disp), sd = sd(disp)) + +# Refer to column names stored as strings with the `.data` pronoun: +var <- "mass" +summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) +# Learn more in ?rlang::args_data_masking + +# In dplyr 1.1.0, returning multiple rows per group was deprecated in favor +# of `reframe()`, which never messages and always returns an ungrouped +# result: +mtcars \%>\% + group_by(cyl) \%>\% + summarise(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) +# -> +mtcars \%>\% + group_by(cyl) \%>\% + reframe(qs = quantile(disp, c(0.25, 0.75)), prob = c(0.25, 0.75)) +} +\seealso{ +Other single table verbs: +\code{\link{arrange}()}, +\code{\link{mutate}()}, +\code{\link{rename}()} +} +\concept{single table verbs} diff --git a/man/symbol_to_entrez.Rd b/man/symbol_to_entrez.Rd index f02ca98d..e6f0b591 100644 --- a/man/symbol_to_entrez.Rd +++ b/man/symbol_to_entrez.Rd @@ -24,6 +24,7 @@ Get ENTREZ id from gene SYMBOL # This function was designed for data.frame # Convert from SummarizedExperiment for this example. 
It is NOT reccomended. -tidybulk::se_mini |> tidybulk() |> as_tibble() |> symbol_to_entrez(.transcript = .feature, .sample = .sample) +tidybulk::se_mini |> tidybulk() |> as_tibble() |> +symbol_to_entrez(.transcript = .feature, .sample = .sample) } diff --git a/man/test_differential_abundance-methods.Rd b/man/test_differential_abundance-methods.Rd index d8beb5cf..21bd0246 100755 --- a/man/test_differential_abundance-methods.Rd +++ b/man/test_differential_abundance-methods.Rd @@ -8,7 +8,9 @@ \alias{test_differential_abundance,tidybulk-method} \alias{test_differential_abundance,SummarizedExperiment-method} \alias{test_differential_abundance,RangedSummarizedExperiment-method} -\title{Perform differential transcription testing using edgeR quasi-likelihood (QLT), edgeR likelihood-ratio (LR), limma-voom, limma-voom-with-quality-weights or DESeq2} +\title{Perform differential transcription testing using edgeR quasi-likelihood +(QLT), edgeR likelihood-ratio (LR), limma-voom, +limma-voom-with-quality-weights or DESeq2} \usage{ test_differential_abundance( .data, @@ -125,9 +127,13 @@ test_differential_abundance( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{A formula representing the desired linear model. If there is more than one factor, they should be in the order factor of interest + additional factors.} +\item{.formula}{A formula representing the desired linear model. 
If there +is more than one factor, they should be in the order factor of interest + +additional factors.} \item{.sample}{The name of the sample column} @@ -135,49 +141,87 @@ test_differential_abundance( \item{.abundance}{The name of the transcript/gene abundance column} -\item{contrasts}{This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)} +\item{contrasts}{This parameter takes the format of the contrast parameter +of the method of choice. For edgeR and limma-voom is a character vector. +For DESeq2 is a list including a character vector of length three. The first +covariate is the one the model is tested against (e.g., ~ factor_of_interest)} -\item{method}{A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), "edgeR_likelihood_ratio" (i.e., LRT), "edger_robust_likelihood_ratio", "DESeq2", "limma_voom", "limma_voom_sample_weights", "glmmseq_lme4", "glmmseq_glmmtmb"} +\item{method}{A string character. Either "edgeR_quasi_likelihood" (i.e., QLF), +"edgeR_likelihood_ratio" (i.e., LRT), "edger_robust_likelihood_ratio", +"DESeq2", "limma_voom", "limma_voom_sample_weights", "glmmseq_lme4", "glmmseq_glmmtmb"} -\item{test_above_log2_fold_change}{A positive real value. This works for edgeR and limma_voom methods. It uses the `treat` function, which tests that the difference in abundance is bigger than this threshold rather than zero \url{https://pubmed.ncbi.nlm.nih.gov/19176553}.} +\item{test_above_log2_fold_change}{A positive real value. This works for edgeR +and limma_voom methods. It uses the `treat` function, which tests that the +difference in abundance is bigger than this threshold rather than zero +\url{https://pubmed.ncbi.nlm.nih.gov/19176553}.} -\item{scaling_method}{A character string. 
The scaling method passed to the back-end functions: edgeR and limma-voom (i.e., edgeR::calcNormFactors; "TMM","TMMwsp","RLE","upperquartile"). Setting the parameter to \"none\" will skip the compensation for sequencing-depth for the method edgeR or limma-voom.} +\item{scaling_method}{A character string. The scaling method passed to the +back-end functions: edgeR and limma-voom (i.e., edgeR::calcNormFactors; +"TMM","TMMwsp","RLE","upperquartile"). Setting the parameter to \"none\" +will skip the compensation for sequencing-depth for the method edgeR or limma-voom.} -\item{omit_contrast_in_colnames}{If just one contrast is specified you can choose to omit the contrast label in the colnames.} +\item{omit_contrast_in_colnames}{If just one contrast is specified you can +choose to omit the contrast label in the colnames.} -\item{prefix}{A character string. The prefix you would like to add to the result columns. It is useful if you want to compare several methods.} +\item{prefix}{A character string. The prefix you would like to add to the +result columns. It is useful if you want to compare several methods.} -\item{action}{A character string. Whether to join the new information to the input tbl (add), or just get the non-redundant tbl with the new information (get).} +\item{action}{A character string. Whether to join the new information to the +input tbl (add), or just get the non-redundant tbl with the new information (get).} -\item{...}{Further arguments passed to some of the internal experimental functions. For example for glmmSeq, it is possible to pass .dispersion, and .scaling_factor column tidyeval to skip the caluclation of dispersion and scaling and use precalculated values. This is helpful is you want to calculate those quantities on many genes and do DE testing on fewer genes. .scaling_factor is the TMM value that can be obtained with tidybulk::scale_abundance.} +\item{...}{Further arguments passed to some of the internal experimental functions. 
+For example for glmmSeq, it is possible to pass .dispersion, and .scaling_factor +column tidyeval to skip the calculation of dispersion and scaling and use precalculated values. +This is helpful if you want to calculate those quantities on many genes and do DE testing on fewer genes. +.scaling_factor is the TMM value that can be obtained with tidybulk::scale_abundance.} \item{significance_threshold}{DEPRECATED - A real between 0 and 1 (usually 0.05).} -\item{fill_missing_values}{DEPRECATED - A boolean. Whether to fill missing sample/transcript values with the median of the transcript. This is rarely needed.} +\item{fill_missing_values}{DEPRECATED - A boolean. Whether to fill missing +sample/transcript values with the median of the transcript. This is rarely needed.} -\item{.contrasts}{DEPRECATED - This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)} +\item{.contrasts}{DEPRECATED - This parameter takes the format of the +contrast parameter of the method of choice. For edgeR and limma-voom is a +character vector. For DESeq2 is a list including a character vector of length +three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)} } \value{ -A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the test (e.g., log fold change, p-value and false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the test (e.g., log fold change, p-value and false discovery rate). 
+A consistent object (to the input) with additional columns for the +statistics from the test (e.g., log fold change, p-value and false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -test_differential_abundance() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +test_differential_abundance() takes as input A `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object (to the +input) with additional columns for the statistics from the hypothesis test. } \details{ `r lifecycle::badge("maturing")` -This function provides the option to use edgeR \url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom \url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights \url{https://doi.org/10.1093/nar/gkv412} or DESeq2 \url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. 
-All methods use raw counts, irrespective of if scale_abundance or adjust_abundance have been calculated, therefore it is essential to add covariates such as batch effects (if applicable) in the formula. +This function provides the option to use edgeR +\url{https://doi.org/10.1093/bioinformatics/btp616}, limma-voom +\url{https://doi.org/10.1186/gb-2014-15-2-r29}, limma_voom_sample_weights +\url{https://doi.org/10.1093/nar/gkv412} or DESeq2 +\url{https://doi.org/10.1186/s13059-014-0550-8} to perform the testing. +All methods use raw counts, irrespective of if scale_abundance or +adjust_abundance have been calculated, therefore it is essential to add +covariates such as batch effects (if applicable) in the formula. Underlying method for edgeR framework: diff --git a/man/test_differential_cellularity-methods.Rd b/man/test_differential_cellularity-methods.Rd index e706193e..a5768c47 100644 --- a/man/test_differential_cellularity-methods.Rd +++ b/man/test_differential_cellularity-methods.Rd @@ -83,9 +83,15 @@ test_differential_cellularity( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{A formula representing the desired linear model. The formula can be of two forms: multivariable (recommended) or univariable Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". The dot represents cell-type proportions, and it is mandatory. If censored regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\"} +\item{.formula}{A formula representing the desired linear model. 
The formula +can be of two forms: multivariable (recommended) or univariable +Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". +The dot represents cell-type proportions, and it is mandatory. If censored +regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\"} \item{.sample}{The name of the sample column} @@ -93,29 +99,41 @@ test_differential_cellularity( \item{.abundance}{The name of the transcript/gene abundance column} -\item{method}{A string character. Either \"cibersort\", \"epic\" or \"llsr\". The regression method will be chosen based on being multivariable: lm or cox-regression (both on logit-transformed proportions); or univariable: beta or cox-regression (on logit-transformed proportions). See .formula for multi- or univariable choice.} +\item{method}{A string character. Either \"cibersort\", \"epic\" or \"llsr\". +The regression method will be chosen based on being multivariable: lm or +cox-regression (both on logit-transformed proportions); or univariable: +beta or cox-regression (on logit-transformed proportions). See .formula +for multi- or univariable choice.} -\item{reference}{A data frame. The transcript/cell_type data frame of integer transcript abundance} +\item{reference}{A data frame. The transcript/cell_type data frame of +integer transcript abundance} \item{significance_threshold}{A real between 0 and 1 (usually 0.05).} \item{...}{Further parameters passed to the method deconvolve_cellularity} } \value{ -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for +the statistics from the hypothesis test (e.g., log fold change, p-value +and false discovery rate). 
A `SummarizedExperiment` object A `SummarizedExperiment` object } \description{ -test_differential_cellularity() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +test_differential_cellularity() takes as input A `tbl` (with at +least three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with additional columns for the statistics from the hypothesis test. } \details{ `r lifecycle::badge("maturing")` This routine applies a deconvolution method (e.g., Cibersort; DOI: 10.1038/nmeth.3337) -and passes the proportions inferred into a generalised linear model (DOI:dx.doi.org/10.1007/s11749-010-0189-z) +and passes the proportions inferred into a generalised linear model +(DOI:dx.doi.org/10.1007/s11749-010-0189-z) or a cox regression model (ISBN: 978-1-4757-3294-8) Underlying method for the generalised linear model: diff --git a/man/test_gene_enrichment-methods.Rd b/man/test_gene_enrichment-methods.Rd index b21525d8..b790d115 100644 --- a/man/test_gene_enrichment-methods.Rd +++ b/man/test_gene_enrichment-methods.Rd @@ -107,9 +107,12 @@ test_gene_enrichment( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{A formula with no response variable, representing the 
desired linear model} +\item{.formula}{A formula with no response variable, representing the +desired linear model} \item{.sample}{The name of the sample column} @@ -117,11 +120,22 @@ test_gene_enrichment( \item{.abundance}{The name of the transcript/gene abundance column} -\item{contrasts}{This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)} +\item{contrasts}{This parameter takes the format of the contrast parameter +of the method of choice. For edgeR and limma-voom is a character vector. +For DESeq2 is a list including a character vector of length three. The first +covariate is the one the model is tested against (e.g., ~ factor_of_interest)} -\item{methods}{A character vector. One or 3 or more methods to use in the testing (currently EGSEA errors if 2 are used). Type EGSEA::egsea.base() to see the supported GSE methods.} +\item{methods}{A character vector. One or 3 or more methods to use in the +testing (currently EGSEA errors if 2 are used). Type EGSEA::egsea.base() to +see the supported GSE methods.} -\item{gene_sets}{A character vector or a list. It can take one or more of the following built-in collections as a character vector: c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. Alternatively, a list of user-supplied gene sets can be provided, to be used with EGSEA buildCustomIdx. In that case, each gene set is a character vector of Entrez IDs and the names of the list are the gene set names.} +\item{gene_sets}{A character vector or a list. 
It can take one or more of +the following built-in collections as a character vector: c("h", "c1", "c2", +"c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", +"kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. +Alternatively, a list of user-supplied gene sets can be provided, to be used +with EGSEA buildCustomIdx. In that case, each gene set is a character vector +of Entrez IDs and the names of the list are the gene set names.} \item{species}{A character. It can be human, mouse or rat.} @@ -129,7 +143,10 @@ test_gene_enrichment( \item{method}{DEPRECATED. Please use methods.} -\item{.contrasts}{DEPRECATED - This parameter takes the format of the contrast parameter of the method of choice. For edgeR and limma-voom is a character vector. For DESeq2 is a list including a character vector of length three. The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)} +\item{.contrasts}{DEPRECATED - This parameter takes the format of the +contrast parameter of the method of choice. For edgeR and limma-voom is a +character vector. For DESeq2 is a list including a character vector of length +three. 
The first covariate is the one the model is tested against (e.g., ~ factor_of_interest)} } \value{ A consistent object (to the input) @@ -145,13 +162,16 @@ A consistent object (to the input) A consistent object (to the input) } \description{ -test_gene_enrichment() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` of gene set information +test_gene_enrichment() takes as input a `tbl` (with at least +three columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a `tbl` of gene set information } \details{ `r lifecycle::badge("maturing")` -This wrapper executes ensemble gene enrichment analyses of the dataset using EGSEA (DOI:0.12688/f1000research.12544.1) - +This wrapper executes ensemble gene enrichment analyses of the +dataset using EGSEA (DOI:10.12688/f1000research.12544.1) dge = data |> @@ -200,7 +220,8 @@ library("EGSEA") .entrez = entrez, .abundance = count, methods = c("roast" , "safe", "gage" , "padog" , "globaltest", "ora" ), - gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), + gene_sets = c("h", "c1", "c2", "c3", "c4", "c5", "c6", + "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), species="human", cores = 2 ) diff --git a/man/test_gene_overrepresentation-methods.Rd b/man/test_gene_overrepresentation-methods.Rd index 74c98237..f679ec50 100644 --- a/man/test_gene_overrepresentation-methods.Rd +++ b/man/test_gene_overrepresentation-methods.Rd @@ -71,17 +71,22 @@ test_gene_overrepresentation( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with 
library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.entrez}{The ENTREZ ID of the transcripts/genes} \item{.do_test}{A boolean column name symbol. It indicates the transcript to check} -\item{species}{A character. For example, human or mouse. MSigDB uses the latin species names (e.g., \"Mus musculus\", \"Homo sapiens\")} +\item{species}{A character. For example, human or mouse. MSigDB uses the +latin species names (e.g., \"Mus musculus\", \"Homo sapiens\")} \item{.sample}{The name of the sample column} -\item{gene_sets}{A character vector. The subset of MSigDB datasets you want to test against (e.g. \"C2\"). If NULL all gene sets are used (suggested). This argument was added to avoid time overflow of the examples.} +\item{gene_sets}{A character vector. The subset of MSigDB datasets you want +to test against (e.g. \"C2\"). If NULL all gene sets are used (suggested). +This argument was added to avoid time overflow of the examples.} \item{gene_set}{DEPRECATED. 
Use gene_sets instead.} } @@ -99,12 +104,16 @@ A `SummarizedExperiment` object A `RangedSummarizedExperiment` object } \description{ -test_gene_overrepresentation() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics +test_gene_overrepresentation() takes as input a `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics } \details{ `r lifecycle::badge("maturing")` -This wrapper execute gene enrichment analyses of the dataset using a list of transcripts and GSEA. +This wrapper executes gene enrichment analyses of the dataset using +a list of transcripts and GSEA. This wrapper uses clusterProfiler (DOI: doi.org/10.1089/omi.2011.0118) on the back-end. Undelying method: diff --git a/man/test_gene_rank-methods.Rd b/man/test_gene_rank-methods.Rd index dadabd6c..d9d854df 100644 --- a/man/test_gene_rank-methods.Rd +++ b/man/test_gene_rank-methods.Rd @@ -71,17 +71,26 @@ test_gene_rank( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.entrez}{The ENTREZ ID of the transcripts/genes} \item{.arrange_desc}{A column name of the column to arrange in decreasing order} -\item{species}{A character. 
For example, human or mouse. MSigDB uses the +latin species names (e.g., \"Mus musculus\", \"Homo sapiens\")} \item{.sample}{The name of the sample column} -\item{gene_sets}{A character vector or a list. It can take one or more of the following built-in collections as a character vector: c("h", "c1", "c2", "c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", "kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. Alternatively, a list of user-supplied gene sets can be provided, to be used with EGSEA buildCustomIdx. In that case, each gene set is a character vector of Entrez IDs and the names of the list are the gene set names.} +\item{gene_sets}{A character vector or a list. It can take one or more of +the following built-in collections as a character vector: c("h", "c1", "c2", +"c3", "c4", "c5", "c6", "c7", "kegg_disease", "kegg_metabolism", +"kegg_signaling"), to be used with EGSEA buildIdx. c1 is human specific. +Alternatively, a list of user-supplied gene sets can be provided, to be used +with EGSEA buildCustomIdx. In that case, each gene set is a character vector +of Entrez IDs and the names of the list are the gene set names.} \item{gene_set}{DEPRECATED. 
Use gene_sets instead.} } @@ -99,19 +108,24 @@ A `SummarizedExperiment` object A `RangedSummarizedExperiment` object } \description{ -test_gene_rank() takes as input a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics +test_gene_rank() takes as input a `tbl` (with at least three +columns for sample, feature and transcript abundance) or +`SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a `tbl` with the GSEA statistics } \details{ \lifecycle{maturing} -This wrapper execute gene enrichment analyses of the dataset using a list of transcripts and GSEA. +This wrapper executes gene enrichment analyses of the dataset using +a list of transcripts and GSEA. This wrapper uses clusterProfiler (DOI: doi.org/10.1089/omi.2011.0118) on the back-end. Undelying method: # Get gene sets signatures msigdbr::msigdbr(species = species) %>% -# Filter specific gene_sets if specified. This was introduced to speed up examples executionS +# Filter specific gene_sets if specified. This was introduced to speed up +examples execution when( !is.null(gene_sets ) ~ filter(., gs_cat %in% gene_sets ), ~ (.) 
diff --git a/man/test_stratification_cellularity-methods.Rd b/man/test_stratification_cellularity-methods.Rd index be7e469d..4663b41e 100644 --- a/man/test_stratification_cellularity-methods.Rd +++ b/man/test_stratification_cellularity-methods.Rd @@ -8,7 +8,8 @@ \alias{test_stratification_cellularity,tidybulk-method} \alias{test_stratification_cellularity,SummarizedExperiment-method} \alias{test_stratification_cellularity,RangedSummarizedExperiment-method} -\title{Test of stratification of biological replicates based on tissue composition, one cell-type at the time, using Kaplan-meier curves.} +\title{Test of stratification of biological replicates based on tissue composition, +one cell-type at the time, using Kaplan-meier curves.} \usage{ test_stratification_cellularity( .data, @@ -77,9 +78,15 @@ test_stratification_cellularity( ) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature and +transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} -\item{.formula}{A formula representing the desired linear model. The formula can be of two forms: multivariable (recommended) or univariable Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". The dot represents cell-type proportions, and it is mandatory. If censored regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\"} +\item{.formula}{A formula representing the desired linear model. The formula +can be of two forms: multivariable (recommended) or univariable +Respectively: \"factor_of_interest ~ .\" or \". ~ factor_of_interest\". +The dot represents cell-type proportions, and it is mandatory. 
If censored +regression is desired (coxph) the formula should be of the form \"survival::Surv\(y, dead\) ~ .\"} \item{.sample}{The name of the sample column} @@ -87,27 +94,43 @@ test_stratification_cellularity( \item{.abundance}{The name of the transcript/gene abundance column} -\item{method}{A string character. Either \"cibersort\", \"epic\" or \"llsr\". The regression method will be chosen based on being multivariable: lm or cox-regression (both on logit-transformed proportions); or univariable: beta or cox-regression (on logit-transformed proportions). See .formula for multi- or univariable choice.} +\item{method}{A string character. Either \"cibersort\", \"epic\" or \"llsr\". +The regression method will be chosen based on being multivariable: lm or +cox-regression (both on logit-transformed proportions); or univariable: beta +or cox-regression (on logit-transformed proportions). See .formula for +multi- or univariable choice.} -\item{reference}{A data frame. The transcript/cell_type data frame of integer transcript abundance} +\item{reference}{A data frame. The transcript/cell_type data frame of integer +transcript abundance} \item{...}{Further parameters passed to the method deconvolve_cellularity} } \value{ -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). -A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). 
-A consistent object (to the input) with additional columns for the statistics from the hypothesis test (e.g., log fold change, p-value and false discovery rate). +A consistent object (to the input) with additional columns for the +statistics from the hypothesis test (e.g., log fold change, p-value and +false discovery rate). } \description{ -test_stratification_cellularity() takes as input A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) and returns a consistent object (to the input) with additional columns for the statistics from the hypothesis test. +test_stratification_cellularity() takes as input A `tbl` +(with at least three columns for sample, feature and transcript abundance) +or `SummarizedExperiment` (more convenient if abstracted to tibble with +library(tidySummarizedExperiment)) and returns a consistent object +(to the input) with additional columns for the statistics from the hypothesis test. 
} \details{ `r lifecycle::badge("maturing")` This routine applies a deconvolution method (e.g., Cibersort; DOI: 10.1038/nmeth.3337) -and passes the proportions inferred into a generalised linear model (DOI:dx.doi.org/10.1007/s11749-010-0189-z) +and passes the proportions inferred into a generalised linear model +(DOI:dx.doi.org/10.1007/s11749-010-0189-z) or a cox regression model (ISBN: 978-1-4757-3294-8) @@ -125,8 +148,6 @@ deconvolve_cellularity( survival::survdiff(data = data, .my_formula) } \examples{ - - tidybulk::se_mini |> test_stratification_cellularity( survival::Surv(days, dead) ~ ., diff --git a/man/tidybulk-methods.Rd b/man/tidybulk-methods.Rd index 0ef4e838..a6fba8f8 100644 --- a/man/tidybulk-methods.Rd +++ b/man/tidybulk-methods.Rd @@ -7,7 +7,8 @@ \alias{tidybulk,tbl_df-method} \alias{tidybulk,SummarizedExperiment-method} \alias{tidybulk,RangedSummarizedExperiment-method} -\title{Creates an annotated `tidybulk` tibble from a `tbl` or `SummarizedExperiment` object} +\title{Creates an annotated `tidybulk` tibble from a `tbl` or +`SummarizedExperiment` object} \usage{ tidybulk(.data, .sample, .transcript, .abundance, .abundance_scaled = NULL) @@ -20,7 +21,9 @@ tidybulk(.data, .sample, .transcript, .abundance, .abundance_scaled = NULL) \S4method{tidybulk}{RangedSummarizedExperiment}(.data, .sample, .transcript, .abundance, .abundance_scaled = NULL) } \arguments{ -\item{.data}{A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment))} +\item{.data}{A `tbl` (with at least three columns for sample, feature +and transcript abundance) or `SummarizedExperiment` (more convenient if +abstracted to tibble with library(tidySummarizedExperiment))} \item{.sample}{The name of the sample column} @@ -28,7 +31,8 @@ tidybulk(.data, .sample, .transcript, .abundance, .abundance_scaled = NULL) \item{.abundance}{The name of the transcript/gene 
abundance column} -\item{.abundance_scaled}{The name of the transcript/gene scaled abundance column} +\item{.abundance_scaled}{The name of the transcript/gene scaled +abundance column} } \value{ A `tidybulk` object @@ -42,7 +46,10 @@ A `tidybulk` object A `tidybulk` object } \description{ -tidybulk() creates an annotated `tidybulk` tibble from a `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +tidybulk() creates an annotated `tidybulk` tibble from a + `tbl` (with at least three columns for sample, feature and transcript + abundance) or `SummarizedExperiment` (more convenient if abstracted to + tibble with library(tidySummarizedExperiment)) } \details{ `r lifecycle::badge("maturing")` @@ -50,7 +57,8 @@ tidybulk() creates an annotated `tidybulk` tibble from a `tbl` (with at least th This function creates a tidybulk object and is useful if you want to avoid to specify .sample, .transcript and .abundance arguments all the times. The tidybulk object have an attribute called internals where these three -arguments are stored as metadata. They can be extracted as attr(, "internals"). +arguments are stored as metadata. They can be extracted as +attr(, "internals"). } \examples{ diff --git a/man/tidybulk_SAM_BAM-methods.Rd b/man/tidybulk_SAM_BAM-methods.Rd index 7087bddd..d7840e70 100644 --- a/man/tidybulk_SAM_BAM-methods.Rd +++ b/man/tidybulk_SAM_BAM-methods.Rd @@ -13,7 +13,9 @@ tidybulk_SAM_BAM(file_names, genome = "hg38", ...) \arguments{ \item{file_names}{A character vector} -\item{genome}{A character string specifying an in-built annotation used for read summarization. It has four possible values including "mm10", "mm9", "hg38" and "hg19"} +\item{genome}{A character string specifying an in-built annotation used for +read summarization. 
It has four possible values including "mm10", "mm9", +"hg38" and "hg19"} \item{...}{Further parameters passed to the function Rsubread::featureCounts} } @@ -23,15 +25,20 @@ A `tidybulk` object A `tidybulk` object } \description{ -tidybulk_SAM_BAM() creates a `tt` object from A `tbl` (with at least three columns for sample, feature and transcript abundance) or `SummarizedExperiment` (more convenient if abstracted to tibble with library(tidySummarizedExperiment)) +tidybulk_SAM_BAM() creates a `tt` object from A `tbl` + (with at least three columns for sample, feature and transcript abundance) + or `SummarizedExperiment` (more convenient if abstracted to tibble with + library(tidySummarizedExperiment)) } \details{ `r lifecycle::badge("maturing")` -This function is based on FeatureCounts package (DOI: 10.1093/bioinformatics/btt656). This function creates a tidybulk object and is useful if you want -to avoid to specify .sample, .transcript and .abundance arguments all the times. -The tidybulk object have an attribute called internals where these three -arguments are stored as metadata. They can be extracted as attr(, "internals"). +This function is based on FeatureCounts package + (DOI: 10.1093/bioinformatics/btt656). This function creates a tidybulk + object and is useful if you want to avoid to specify .sample, + .transcript and .abundance arguments all the times. The tidybulk object + have an attribute called internals where these three arguments are stored + as metadata. They can be extracted as attr(, "internals"). Underlying core function Rsubread::featureCounts(annot.inbuilt = genome,nthreads = n_cores, ...) 
diff --git a/man/ungroup.Rd b/man/ungroup.Rd new file mode 100644 index 00000000..d634c6df --- /dev/null +++ b/man/ungroup.Rd @@ -0,0 +1,148 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dplyr_methods.R +\name{ungroup} +\alias{ungroup} +\alias{ungroup.tidybulk} +\title{Group by one or more variables} +\usage{ +\method{ungroup}{tidybulk}(x, ...) +} +\arguments{ +\item{x}{A \code{\link[dplyr:tbl]{tbl()}}} + +\item{...}{In \code{group_by()}, variables or computations to group by. +Computations are always done on the ungrouped data frame. +To perform computations on the grouped data, you need to use +a separate \code{mutate()} step before the \code{group_by()}. +Computations are not allowed in \code{nest_by()}. +In \code{ungroup()}, variables to remove from the grouping.} +} +\value{ +A grouped data frame with class \code{\link[dplyr]{grouped_df}}, +unless the combination of \code{...} and \code{add} yields a empty set of +grouping columns, in which case a tibble will be returned. +} +\description{ +Most data operations are done on groups defined by variables. +\code{group_by()} takes an existing tbl and converts it into a grouped tbl +where operations are performed "by group". \code{ungroup()} removes grouping. +} +\section{Methods}{ + + +These function are \strong{generic}s, which means that packages can provide +implementations (methods) for other classes. See the documentation of +individual methods for extra arguments and differences in behaviour. + +Methods available in currently loaded packages: +\itemize{ +\item \code{group_by()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("group_by")}. +\item \code{ungroup()}: \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("ungroup")}. +} + +} + +\section{Ordering}{ + + +Currently, \code{group_by()} internally orders the groups in ascending order. This +results in ordered output from functions that aggregate groups, such as +\code{\link[dplyr:summarise]{summarise()}}. 
+ +When used as grouping columns, character vectors are ordered in the C locale +for performance and reproducibility across R sessions. If the resulting +ordering of your grouped operation matters and is dependent on the locale, +you should follow up the grouped operation with an explicit call to +\code{\link[dplyr:arrange]{arrange()}} and set the \code{.locale} argument. For example: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{data \%>\% + group_by(chr) \%>\% + summarise(avg = mean(x)) \%>\% + arrange(chr, .locale = "en") +}\if{html}{\out{</div>
}} + +This is often useful as a preliminary step before generating content intended +for humans, such as an HTML table. +\subsection{Legacy behavior}{ + +Prior to dplyr 1.1.0, character vector grouping columns were ordered in the +system locale. If you need to temporarily revert to this behavior, you can +set the global option \code{dplyr.legacy_locale} to \code{TRUE}, but this should be +used sparingly and you should expect this option to be removed in a future +version of dplyr. It is better to update existing code to explicitly call +\code{arrange(.locale = )} instead. Note that setting \code{dplyr.legacy_locale} will +also force calls to \code{\link[dplyr:arrange]{arrange()}} to use the system locale. +} + +} + +\examples{ +by_cyl <- mtcars \%>\% group_by(cyl) + +# grouping doesn't change how the data looks (apart from listing +# how it's grouped): +by_cyl + +# It changes how it acts with the other dplyr verbs: +by_cyl \%>\% summarise( + disp = mean(disp), + hp = mean(hp) +) +by_cyl \%>\% filter(disp == max(disp)) + +# Each call to summarise() removes a layer of grouping +by_vs_am <- mtcars \%>\% group_by(vs, am) +by_vs <- by_vs_am \%>\% summarise(n = n()) +by_vs +by_vs \%>\% summarise(n = sum(n)) + +# To removing grouping, use ungroup +by_vs \%>\% + ungroup() \%>\% + summarise(n = sum(n)) + +# By default, group_by() overrides existing grouping +by_cyl \%>\% + group_by(vs, am) \%>\% + group_vars() + +# Use add = TRUE to instead append +by_cyl \%>\% + group_by(vs, am, .add = TRUE) \%>\% + group_vars() + +# You can group by expressions: this is a short-hand +# for a mutate() followed by a group_by() +mtcars \%>\% + group_by(vsam = vs + am) + +# The implicit mutate() step is always performed on the +# ungrouped data. Here we get 3 groups: +mtcars \%>\% + group_by(vs) \%>\% + group_by(hp_cut = cut(hp, 3)) + +# If you want it to be performed by groups, +# you have to use an explicit mutate() call. 
+# Here we get 3 groups per value of vs +mtcars \%>\% + group_by(vs) \%>\% + mutate(hp_cut = cut(hp, 3)) \%>\% + group_by(hp_cut) + +# when factors are involved and .drop = FALSE, groups can be empty +tbl <- tibble( + x = 1:10, + y = factor(rep(c("a", "c"), each = 5), levels = c("a", "b", "c")) +) +tbl \%>\% + group_by(y, .drop = FALSE) \%>\% + group_rows() +} +\seealso{ +Other grouping functions: +\code{\link[dplyr]{group_map}()}, +\code{\link[dplyr]{group_nest}()}, +\code{\link[dplyr]{group_split}()}, +\code{\link[dplyr]{group_trim}()} +} diff --git a/man/unnest.Rd b/man/unnest.Rd new file mode 100644 index 00000000..68da1d78 --- /dev/null +++ b/man/unnest.Rd @@ -0,0 +1,111 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tidyr_methods.R +\name{unnest} +\alias{unnest} +\alias{unnest.nested_tidybulk} +\title{Unnest a list-column of data frames into rows and columns} +\usage{ +\method{unnest}{nested_tidybulk}( + data, + cols, + ..., + keep_empty = FALSE, + ptype = NULL, + names_sep = NULL, + names_repair = "check_unique", + .drop, + .id, + .sep, + .preserve +) +} +\arguments{ +\item{data}{A data frame.} + +\item{cols}{<\code{\link[tidyr:tidyr_tidy_select]{tidy-select}}> List-columns to unnest. + +When selecting multiple columns, values from the same row will be recycled +to their common size.} + +\item{...}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +previously you could write \code{df \%>\% unnest(x, y, z)}. +Convert to \code{df \%>\% unnest(c(x, y, z))}. If you previously created a new +variable in \code{unnest()} you'll now need to do it explicitly with \code{mutate()}. +Convert \code{df \%>\% unnest(y = fun(x, y, z))} +to \code{df \%>\% mutate(y = fun(x, y, z)) \%>\% unnest(y)}.} + +\item{keep_empty}{By default, you get one row of output for each element +of the list that you are unchopping/unnesting. 
This means that if there's a +size-0 element (like \code{NULL} or an empty data frame or vector), then that +entire row will be dropped from the output. If you want to preserve all +rows, use \code{keep_empty = TRUE} to replace size-0 elements with a single row +of missing values.} + +\item{ptype}{Optionally, a named list of column name-prototype pairs to +coerce \code{cols} to, overriding the default that will be guessed from +combining the individual values. Alternatively, a single empty ptype +can be supplied, which will be applied to all \code{cols}.} + +\item{names_sep}{If \code{NULL}, the default, the outer names will come from the +inner names. If a string, the outer names will be formed by pasting +together the outer and the inner column names, separated by \code{names_sep}.} + +\item{names_repair}{Used to check that output data frame has valid +names. Must be one of the following options: +\itemize{ +\item \verb{"minimal}": no name repair or checks, beyond basic existence, +\item \verb{"unique}": make sure names are unique and not empty, +\item \verb{"check_unique}": (the default), no name repair, but check they are unique, +\item \verb{"universal}": make the names unique and syntactic +\item a function: apply custom name repair. +\item \link[tidyr]{tidyr_legacy}: use the name repair from tidyr 0.8. 
+\item a formula: a purrr-style anonymous function (see \code{\link[rlang:as_function]{rlang::as_function()}}) +} + +See \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}} for more details on these terms and the +strategies used to enforce them.} + +\item{.drop, .preserve}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +all list-columns are now preserved; If there are any that you +don't want in the output use \code{select()} to remove them prior to +unnesting.} + +\item{.id}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +convert \code{df \%>\% unnest(x, .id = "id")} to \verb{df \%>\% mutate(id = names(x)) \%>\% unnest(x))}.} + +\item{.sep}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}}: +use \code{names_sep} instead.} +} +\value{ +`tidySingleCellExperiment` +} +\description{ +Unnest expands a list-column containing data frames into rows and columns. +} +\section{New syntax}{ + + +tidyr 1.0.0 introduced a new syntax for \code{nest()} and \code{unnest()} that's +designed to be more similar to other functions. Converting to the new syntax +should be straightforward (guided by the message you'll receive) but if +you just need to run an old analysis, you can easily revert to the previous +behaviour using \code{\link[tidyr:nest_legacy]{nest_legacy()}} and \code{\link[tidyr:unnest_legacy]{unnest_legacy()}} as follows: + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{library(tidyr) +nest <- nest_legacy +unnest <- unnest_legacy +}\if{html}{\out{</div>
}} + +} + +\examples{ +data(se_mini) +se_mini |> tidybulk() |> nest( data = -.feature) |> unnest(data) + +} +\seealso{ +Other rectangling: +\code{\link[tidyr]{hoist}()}, +\code{\link[tidyr]{unnest_longer}()}, +\code{\link[tidyr]{unnest_wider}()} +} diff --git a/tests/testthat/test-bulk_methods.R b/tests/testthat/test-bulk_methods.R index 3a32bb11..cc7fa8e6 100755 --- a/tests/testthat/test-bulk_methods.R +++ b/tests/testthat/test-bulk_methods.R @@ -655,12 +655,6 @@ test_that("New method choice",{ test_that("DESeq2 differential trancript abundance - no object",{ - if (find.package("DESeq2", quiet = TRUE) |> length() |> equals(0)) { - if (!requireNamespace("BiocManager", quietly = TRUE)) - install.packages("BiocManager", repos = "https://cloud.r-project.org") - BiocManager::install("DESeq2", ask = FALSE) - } - test_deseq2_df = DESeq2::DESeqDataSet(se_mini,design=~condition) colData(test_deseq2_df)$condition = factor(colData(test_deseq2_df)$condition) diff --git a/tidybulk.Rproj b/tidybulk.Rproj new file mode 100644 index 00000000..21a4da08 --- /dev/null +++ b/tidybulk.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source diff --git a/vignettes/.gitignore b/vignettes/.gitignore deleted file mode 100644 index f1f50d0c..00000000 --- a/vignettes/.gitignore +++ /dev/null @@ -1 +0,0 @@ -manuscript_differential_transcript_abundance_cache