From 34e64fc24e313d48aceceddbfaa01f8e128b1965 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Wed, 8 Mar 2023 17:02:24 +1100 Subject: [PATCH 1/7] WIP implementation of the original suggested API --- NAMESPACE | 2 + R/query.R | 51 +---------------------- R/unharmonised.R | 71 ++++++++++++++++++++++++++++++++ man/get_unharmonised_dataset.Rd | 43 +++++++++++++++++++ man/get_unharmonised_metadata.Rd | 39 ++++++++---------- 5 files changed, 135 insertions(+), 71 deletions(-) create mode 100644 R/unharmonised.R create mode 100644 man/get_unharmonised_dataset.Rd diff --git a/NAMESPACE b/NAMESPACE index 2012c4c..cb89968 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,9 +29,11 @@ importFrom(dplyr,as_tibble) importFrom(dplyr,collect) importFrom(dplyr,filter) importFrom(dplyr,full_join) +importFrom(dplyr,group_by) importFrom(dplyr,inner_join) importFrom(dplyr,mutate) importFrom(dplyr,pull) +importFrom(dplyr,summarise) importFrom(dplyr,tbl) importFrom(dplyr,tibble) importFrom(dplyr,transmute) diff --git a/R/query.R b/R/query.R index 0beb670..e24564c 100644 --- a/R/query.R +++ b/R/query.R @@ -447,53 +447,4 @@ get_metadata <- function( duckdb() |> dbConnect(drv = _, read_only = TRUE) |> tbl(db_path) -} - -#' Returns unharmonised metadata for selected datasets. -#' -#' Various metadata fields are *not* common between datasets, so it does not -#' make sense for these to live in the main metadata table. This function is a -#' utility that allows easy fetching of this data if necessary. -#' -#' @param dataset_ids A character vector, where each entry is a dataset ID -#' obtained from the `$file_id` column of the table returned from -#' [get_metadata()] -#' @param remote_url Optional character vector of length 1. An HTTP URL pointing -#' to the root URL under which all the unharmonised dataset files are located. -#' @param cache_directory Optional character vector of length 1. 
A file path on -#' your local system to a directory (not a file) that will be used to store -#' the unharmonised metadata files. -#' @importFrom purrr map set_names -#' @importFrom glue glue -#' @importFrom DBI dbConnect -#' @importFrom duckdb duckdb -#' @importFrom dplyr tbl -#' @return A named list, where each name is a dataset file ID, and each value is -#' a "lazy data frame", ie a `tbl`. -#' @export -#' @examples -#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -#' unharmonised_meta = get_unharmonised_metadata(dataset) -#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) -get_unharmonised_metadata = function( - dataset_ids, - remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", - cache_directory = get_default_cache_dir() - ){ - unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised") - duck = duckdb() |> dbConnect(drv = _, read_only = TRUE) - dataset_ids |> - set_names() |> - map(function(dataset_id){ - file_name = glue::glue("{dataset_id}.parquet") - local_path = file.path(unharmonised_root, file_name) - glue("{remote_url}/{file_name}") |> - sync_remote_file( - local_path, - progress(type = "down", con = stderr()) - ) - tbl(duck, local_path) - }) -} +} \ No newline at end of file diff --git a/R/unharmonised.R b/R/unharmonised.R new file mode 100644 index 0000000..4c75d6c --- /dev/null +++ b/R/unharmonised.R @@ -0,0 +1,71 @@ +#' Returns unharmonised metadata for selected datasets. +#' +#' Various metadata fields are *not* common between datasets, so it does not +#' make sense for these to live in the main metadata table. This function is a +#' utility that allows easy fetching of this data if necessary. 
+#' +#' @param dataset_ids A character vector, where each entry is a dataset ID +#' obtained from the `$file_id` column of the table returned from +#' [get_metadata()] +#' @param remote_url Optional character vector of length 1. An HTTP URL pointing +#' to the root URL under which all the unharmonised dataset files are located. +#' @param cache_directory Optional character vector of length 1. A file path on +#' your local system to a directory (not a file) that will be used to store +#' the unharmonised metadata files. +#' @importFrom purrr map set_names +#' @importFrom glue glue +#' @importFrom DBI dbConnect +#' @importFrom duckdb duckdb +#' @importFrom dplyr tbl filter +#' @return A named list, where each name is a dataset file ID, and each value is +#' a "lazy data frame", ie a `tbl`. +#' @examples +#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" +#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() +#' unharmonised_meta = get_unharmonised_metadata_list(dataset) +#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) +#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +get_unharmonised_dataset = function( + dataset_id, + cells = NULL, + conn = duckdb() |> dbConnect(drv = _, read_only = TRUE), + remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", + cache_directory = get_default_cache_dir() +){ + unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised") + file_name = glue::glue("{dataset_id}.parquet") + local_path = file.path(unharmonised_root, file_name) + glue("{remote_url}/{file_name}") |> + sync_remote_file( + local_path, + progress(type = "down", con = stderr()) + ) + tbl(conn, local_path) |> + filter(cell_ %in% cells) +} + +#' Returns unharmonised metadata for a metadata query +#' @inherit get_unharmonised_dataset description +#' @param metadata A lazy data frame obtained from 
[get_metadata()], filtered +#' down to some cells of interest +#' @inheritDotParams get_unharmonised_dataset +#' @return A tibble with two columns: +#' * `file_id`: the same `file_id` as the main metadata table obtained from [get_metadata()] +#' * `unharmonised`: a nested tibble, with one row per cell in the input `metadata`, containing unharmonised metadata +#' @export +#' @importFrom dplyr group_by summarise filter collect +#' @examples +#' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +#' unharmonised <- get_unharmonised_metadata(harmonised) +get_unharmonised_metadata = function(metadata, ...){ + args = list(...) + metadata |> + collect() |> + group_by(file_id) |> + summarise( + unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=metadata$src$con) |> + c(args) |> + do.call(get_unharmonised_dataset, args=_) |> + list() + ) +} \ No newline at end of file diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd new file mode 100644 index 0000000..27c1bde --- /dev/null +++ b/man/get_unharmonised_dataset.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/unharmonised.R +\name{get_unharmonised_dataset} +\alias{get_unharmonised_dataset} +\title{Returns unharmonised metadata for selected datasets.} +\usage{ +get_unharmonised_dataset( + dataset_id, + cells = NULL, + conn = dbConnect(drv = duckdb(), read_only = TRUE), + remote_url = + "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", + cache_directory = get_default_cache_dir() +) +} +\arguments{ +\item{remote_url}{Optional character vector of length 1. An HTTP URL pointing +to the root URL under which all the unharmonised dataset files are located.} + +\item{cache_directory}{Optional character vector of length 1. 
A file path on +your local system to a directory (not a file) that will be used to store +the unharmonised metadata files.} + +\item{dataset_ids}{A character vector, where each entry is a dataset ID +obtained from the \verb{$file_id} column of the table returned from +\code{\link[=get_metadata]{get_metadata()}}} +} +\value{ +A named list, where each name is a dataset file ID, and each value is +a "lazy data frame", ie a \code{tbl}. +} +\description{ +Various metadata fields are \emph{not} common between datasets, so it does not +make sense for these to live in the main metadata table. This function is a +utility that allows easy fetching of this data if necessary. +} +\examples{ +dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" +harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() +unharmonised_meta = get_unharmonised_metadata_list(dataset) +unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) +dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +} diff --git a/man/get_unharmonised_metadata.Rd b/man/get_unharmonised_metadata.Rd index a140d4d..fb7c8a3 100644 --- a/man/get_unharmonised_metadata.Rd +++ b/man/get_unharmonised_metadata.Rd @@ -1,31 +1,31 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/query.R +% Please edit documentation in R/unharmonised.R \name{get_unharmonised_metadata} \alias{get_unharmonised_metadata} -\title{Returns unharmonised metadata for selected datasets.} +\title{Returns unharmonised metadata for a metadata query} \usage{ -get_unharmonised_metadata( - dataset_ids, - remote_url = - "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", - cache_directory = get_default_cache_dir() -) +get_unharmonised_metadata(metadata, ...) 
} \arguments{ -\item{dataset_ids}{A character vector, where each entry is a dataset ID -obtained from the \verb{$file_id} column of the table returned from -\code{\link[=get_metadata]{get_metadata()}}} +\item{metadata}{A lazy data frame obtained from \code{\link[=get_metadata]{get_metadata()}}, filtered +down to some cells of interest} -\item{remote_url}{Optional character vector of length 1. An HTTP URL pointing +\item{...}{ + Arguments passed on to \code{\link[=get_unharmonised_dataset]{get_unharmonised_dataset}} + \describe{ + \item{\code{remote_url}}{Optional character vector of length 1. An HTTP URL pointing to the root URL under which all the unharmonised dataset files are located.} - -\item{cache_directory}{Optional character vector of length 1. A file path on + \item{\code{cache_directory}}{Optional character vector of length 1. A file path on your local system to a directory (not a file) that will be used to store the unharmonised metadata files.} + }} } \value{ -A named list, where each name is a dataset file ID, and each value is -a "lazy data frame", ie a \code{tbl}. +A tibble with two columns: +\itemize{ +\item \code{file_id}: the same \code{file_id} as the main metadata table obtained from \code{\link[=get_metadata]{get_metadata()}} +\item \code{unharmonised}: a nested tibble, with one row per cell in the input \code{metadata}, containing unharmonised metadata +} } \description{ Various metadata fields are \emph{not} common between datasets, so it does not @@ -33,9 +33,6 @@ make sense for these to live in the main metadata table. This function is a utility that allows easy fetching of this data if necessary. 
} \examples{ -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) } From 54e59c45d8ab3c4b6905147bc68a2f0226742840 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Mar 2023 13:21:19 +1100 Subject: [PATCH 2/7] Use public connection API --- R/unharmonised.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/unharmonised.R b/R/unharmonised.R index 4c75d6c..efc71d2 100644 --- a/R/unharmonised.R +++ b/R/unharmonised.R @@ -63,7 +63,7 @@ get_unharmonised_metadata = function(metadata, ...){ collect() |> group_by(file_id) |> summarise( - unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=metadata$src$con) |> + unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=dbplyr::remote_con(metadata)) |> c(args) |> do.call(get_unharmonised_dataset, args=_) |> list() From af560a4447365f452ee9b08ecc91ffb8ecef7823 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Mar 2023 16:11:04 +1100 Subject: [PATCH 3/7] R CMD check fixes --- R/unharmonised.R | 16 +++++++++++----- README.Rmd | 22 +++++++++------------ man/get_unharmonised_dataset.Rd | 16 +++++++++++----- man/get_unharmonised_metadata.Rd | 7 +++++++ tests/testthat/test-query.R | 33 +++++++++++++++++++++----------- vignettes/Introduction.Rmd | 20 +++++++++---------- 6 files changed, 69 insertions(+), 45 deletions(-) diff --git a/R/unharmonised.R b/R/unharmonised.R index efc71d2..1c9003e 100644 --- a/R/unharmonised.R +++ b/R/unharmonised.R @@ -4,9 +4,13 @@ #' make sense for these to live in the main metadata table. 
This function is a #' utility that allows easy fetching of this data if necessary. #' -#' @param dataset_ids A character vector, where each entry is a dataset ID +#' @param dataset_id A character vector, where each entry is a dataset ID #' obtained from the `$file_id` column of the table returned from #' [get_metadata()] +#' @param cells An optional character vector of cell IDs. If provided, only +#' metadata for those cells will be returned. +#' @param conn An optional DuckDB connection object. If provided, it will re-use +#' the existing connection instead of opening a new one. #' @param remote_url Optional character vector of length 1. An HTTP URL pointing #' to the root URL under which all the unharmonised dataset files are located. #' @param cache_directory Optional character vector of length 1. A file path on @@ -17,12 +21,13 @@ #' @importFrom DBI dbConnect #' @importFrom duckdb duckdb #' @importFrom dplyr tbl filter +#' @importFrom rlang .data #' @return A named list, where each name is a dataset file ID, and each value is #' a "lazy data frame", ie a `tbl`. 
#' @examples #' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" #' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -#' unharmonised_meta = get_unharmonised_metadata_list(dataset) +#' unharmonised_meta = get_unharmonised_dataset(dataset) #' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) #' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) get_unharmonised_dataset = function( @@ -41,7 +46,7 @@ get_unharmonised_dataset = function( progress(type = "down", con = stderr()) ) tbl(conn, local_path) |> - filter(cell_ %in% cells) + filter(.data$cell_ %in% cells) } #' Returns unharmonised metadata for a metadata query @@ -54,6 +59,7 @@ get_unharmonised_dataset = function( #' * `unharmonised`: a nested tibble, with one row per cell in the input `metadata`, containing unharmonised metadata #' @export #' @importFrom dplyr group_by summarise filter collect +#' @importFrom rlang .data #' @examples #' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") #' unharmonised <- get_unharmonised_metadata(harmonised) @@ -61,9 +67,9 @@ get_unharmonised_metadata = function(metadata, ...){ args = list(...) metadata |> collect() |> - group_by(file_id) |> + group_by(.data$file_id) |> summarise( - unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=dbplyr::remote_con(metadata)) |> + unharmonised = list(dataset_id=.data$file_id[[1]], cells=.data$cell_, conn=dbplyr::remote_con(metadata)) |> c(args) |> do.call(get_unharmonised_dataset, args=_) |> list() diff --git a/README.Rmd b/README.Rmd index fe5f3db..a946119 100644 --- a/README.Rmd +++ b/README.Rmd @@ -279,26 +279,22 @@ knitr::include_graphics("man/figures/HLA_A_tissue_plot.png") Various metadata fields are *not* common between datasets, so it does not make sense for these to live in the main metadata table. However, we can -obtain it using the `get_unharmonised_metadata()` function. 
- -Note how this table has additional columns that are not in the normal metadata: +obtain it using the `get_unharmonised_metadata()` function. This function +returns a data frame with one row per dataset, including the `unharmonised` +column which contains unharmonised metadata as a nested data frame. ```{r} -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -unharmonised_tbl +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) +unharmonised ``` -If we have metadata from the normal metadata table that is from a single dataset, -we can even join this additional metadata into one big data frame: +Notice that the columns differ between each dataset's data frame: + ```{r} -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +dplyr::pull(unharmonised, unharmonised) |> head(2) ``` - - # Cell metadata Dataset-specific columns (definitions available at cellxgene.cziscience.com) diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd index 27c1bde..e79ee8e 100644 --- a/man/get_unharmonised_dataset.Rd +++ b/man/get_unharmonised_dataset.Rd @@ -14,16 +14,22 @@ get_unharmonised_dataset( ) } \arguments{ +\item{dataset_id}{A character vector, where each entry is a dataset ID +obtained from the \verb{$file_id} column of the table returned from +\code{\link[=get_metadata]{get_metadata()}}} + +\item{cells}{An optional character vector of cell IDs. If provided, only +metadata for those cells will be returned.} + +\item{conn}{An optional DuckDB connection object. If provided, it will re-use +the existing connection instead of opening a new one.} + \item{remote_url}{Optional character vector of length 1. 
An HTTP URL pointing to the root URL under which all the unharmonised dataset files are located.} \item{cache_directory}{Optional character vector of length 1. A file path on your local system to a directory (not a file) that will be used to store the unharmonised metadata files.} - -\item{dataset_ids}{A character vector, where each entry is a dataset ID -obtained from the \verb{$file_id} column of the table returned from -\code{\link[=get_metadata]{get_metadata()}}} } \value{ A named list, where each name is a dataset file ID, and each value is @@ -37,7 +43,7 @@ utility that allows easy fetching of this data if necessary. \examples{ dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -unharmonised_meta = get_unharmonised_metadata_list(dataset) +unharmonised_meta = get_unharmonised_dataset(dataset) unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) } diff --git a/man/get_unharmonised_metadata.Rd b/man/get_unharmonised_metadata.Rd index fb7c8a3..9d014f8 100644 --- a/man/get_unharmonised_metadata.Rd +++ b/man/get_unharmonised_metadata.Rd @@ -13,6 +13,13 @@ down to some cells of interest} \item{...}{ Arguments passed on to \code{\link[=get_unharmonised_dataset]{get_unharmonised_dataset}} \describe{ + \item{\code{dataset_id}}{A character vector, where each entry is a dataset ID +obtained from the \verb{$file_id} column of the table returned from +\code{\link[=get_metadata]{get_metadata()}}} + \item{\code{cells}}{An optional character vector of cell IDs. If provided, only +metadata for those cells will be returned.} + \item{\code{conn}}{An optional DuckDB connection object. If provided, it will re-use +the existing connection instead of opening a new one.} \item{\code{remote_url}}{Optional character vector of length 1. 
An HTTP URL pointing to the root URL under which all the unharmonised dataset files are located.} \item{\code{cache_directory}}{Optional character vector of length 1. A file path on diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index e73bb36..371c759 100755 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -157,18 +157,29 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", { ) }) -test_that("get_unharmonised_metadata works with one ID", { +test_that("get_unharmonised_dataset works with one ID", { dataset_id = "838ea006-2369-4e2c-b426-b2a744a2b02b" - unharmonised_meta = get_unharmonised_metadata(dataset_id) - unharmonised_tbl = unharmonised_meta[[dataset_id]] - - expect_type(unharmonised_meta, "list") - expect_s3_class(unharmonised_tbl, "tbl") + unharmonised_meta = get_unharmonised_dataset(dataset_id) + + expect_s3_class(unharmonised_meta, "tbl") }) -test_that("get_unharmonised_metadata works with multiple IDs", { - dataset_ids = c("838ea006-2369-4e2c-b426-b2a744a2b02b", "83b9cb97-9ee4-404d-8cdf-ccede8235356") - unharmonised_meta = get_unharmonised_metadata(dataset_ids) +test_that("get_unharmonised_metadata() returns the appropriate data", { + harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") + unharmonised <- get_unharmonised_metadata(harmonised) - expect_equal(names(unharmonised_meta), dataset_ids) -}) + unharmonised |> is.data.frame() |> expect_true() + expect_setequal(colnames(unharmonised), c("file_id", "unharmonised")) + + # The number of cells in both harmonised and unharmonised should be the same + expect_equal( + dplyr::collect(harmonised) |> nrow(), + unharmonised$unharmonised |> purrr::map_int(function(df) dplyr::tally(df) |> dplyr::pull(n)) |> sum() + ) + + # The number of datasets in both harmonised and unharmonised should be the same + expect_equal( + harmonised |> dplyr::group_by(file_id) |> dplyr::n_groups(), + nrow(unharmonised) + ) +}) \ No 
newline at end of file diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd index 1bd6407..fd40f6d 100644 --- a/vignettes/Introduction.Rmd +++ b/vignettes/Introduction.Rmd @@ -297,22 +297,20 @@ knitr::include_graphics("../man/figures/HLA_A_tissue_plot.png") Various metadata fields are *not* common between datasets, so it does not make sense for these to live in the main metadata table. However, we can -obtain it using the `get_unharmonised_metadata()` function. - -Note how this table has additional columns that are not in the normal metadata: +obtain it using the `get_unharmonised_metadata()` function. This function +returns a data frame with one row per dataset, including the `unharmonised` +column which contains unharmonised metadata as a nested data frame. ```{r} -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -unharmonised_tbl +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) +unharmonised ``` -If we have metadata from the normal metadata table that is from a single dataset, -we can even join this additional metadata into one big data frame: +Notice that the columns differ between each dataset's data frame: + ```{r} -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +dplyr::pull(unharmonised, unharmonised) |> head(2) ``` # Cell metadata From d39cb9e86c82f394bb43c7e293b58100ef3ef514 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Mar 2023 16:13:40 +1100 Subject: [PATCH 4/7] Line endings --- R/query.R | 2 +- R/unharmonised.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/query.R b/R/query.R index e24564c..cb5e7a0 100644 --- a/R/query.R +++ b/R/query.R @@ -447,4 +447,4 @@ get_metadata <- 
function( duckdb() |> dbConnect(drv = _, read_only = TRUE) |> tbl(db_path) -} \ No newline at end of file +} diff --git a/R/unharmonised.R b/R/unharmonised.R index 1c9003e..9c743e9 100644 --- a/R/unharmonised.R +++ b/R/unharmonised.R @@ -74,4 +74,4 @@ get_unharmonised_metadata = function(metadata, ...){ do.call(get_unharmonised_dataset, args=_) |> list() ) -} \ No newline at end of file +} From dfdfc77fc64ddbca4c4533aa6d0c6e407d4f0037 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Mar 2023 16:15:19 +1100 Subject: [PATCH 5/7] More line endings --- tests/testthat/test-query.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index 371c759..17b1bd0 100755 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -182,4 +182,4 @@ test_that("get_unharmonised_metadata() returns the appropriate data", { harmonised |> dplyr::group_by(file_id) |> dplyr::n_groups(), nrow(unharmonised) ) -}) \ No newline at end of file +}) From 626d6145df0a76e1d9c99fa7b21dc7af77b918e3 Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Mar 2023 16:54:39 +1100 Subject: [PATCH 6/7] Rebuild readme, fix example error --- R/unharmonised.R | 9 +++- README.md | 133 ++++++++++++++++++++++++----------------------- 2 files changed, 76 insertions(+), 66 deletions(-) diff --git a/R/unharmonised.R b/R/unharmonised.R index 9c743e9..601744d 100644 --- a/R/unharmonised.R +++ b/R/unharmonised.R @@ -25,11 +25,13 @@ #' @return A named list, where each name is a dataset file ID, and each value is #' a "lazy data frame", ie a `tbl`. 
#' @examples +#' \dontrun{ #' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" #' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() #' unharmonised_meta = get_unharmonised_dataset(dataset) #' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) #' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +#' } get_unharmonised_dataset = function( dataset_id, cells = NULL, @@ -60,6 +62,7 @@ get_unharmonised_dataset = function( #' @export #' @importFrom dplyr group_by summarise filter collect #' @importFrom rlang .data +#' @importFrom dbplyr remote_con #' @examples #' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") #' unharmonised <- get_unharmonised_metadata(harmonised) @@ -69,7 +72,11 @@ get_unharmonised_metadata = function(metadata, ...){ collect() |> group_by(.data$file_id) |> summarise( - unharmonised = list(dataset_id=.data$file_id[[1]], cells=.data$cell_, conn=dbplyr::remote_con(metadata)) |> + unharmonised = list( + dataset_id=.data$file_id[[1]], + cells=.data$cell_, + conn=remote_con(metadata) + ) |> c(args) |> do.call(get_unharmonised_dataset, args=_) |> list() diff --git a/README.md b/README.md index 6b80025..2624f5b 100644 --- a/README.md +++ b/README.md @@ -70,18 +70,18 @@ metadata |> dplyr::count(tissue) #> # Source: SQL [?? 
x 2] #> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:] -#> tissue n -#> -#> 1 blood 47 -#> 2 respiratory airway 16 -#> 3 mammary gland epithelial cell (cell culture) 1 -#> 4 colon 3 -#> 5 intestine 18 -#> 6 pleural effusion 11 -#> 7 lymph node 15 -#> 8 lung 27 -#> 9 liver 24 -#> 10 axilla 10 +#> tissue n +#> +#> 1 cerebellum 3 +#> 2 telencephalon 2 +#> 3 heart 3 +#> 4 intestine 18 +#> 5 kidney 19 +#> 6 liver 24 +#> 7 lung 27 +#> 8 muscle organ 3 +#> 9 pancreas 5 +#> 10 placenta 3 #> # … with more rows ``` @@ -294,65 +294,68 @@ metadata |> Various metadata fields are *not* common between datasets, so it does not make sense for these to live in the main metadata table. However, we -can obtain it using the `get_unharmonised_metadata()` function. - -Note how this table has additional columns that are not in the normal -metadata: +can obtain it using the `get_unharmonised_metadata()` function. This +function returns a data frame with one row per dataset, including the +`unharmonised` column which contains unharmonised metadata as a nested +data frame. 
``` r -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -unharmonised_tbl -#> # A tibble: 168,860 × 23 -#> cell_ file_id Neuro…¹ Class Subcl…² Super…³ Age.a…⁴ Years…⁵ Cogni…⁶ ADNC -#> -#> 1 GGACGAAG… 838ea0… FALSE Neur… L4 IT L4 IT_2 90+ ye… 16 to … Dement… High -#> 2 TCACGGGA… 838ea0… FALSE Neur… L4 IT L4 IT_1 90+ ye… 12 to … Dement… Inte… -#> 3 TCAGTTTT… 838ea0… FALSE Neur… L4 IT L4 IT_2 78 to … 16 to … No dem… Low -#> 4 TCAGTCCT… 838ea0… FALSE Neur… L4 IT L4 IT_4 78 to … 16 to … Dement… Inte… -#> 5 AGCCACGC… 838ea0… FALSE Neur… L4 IT L4 IT_2 78 to … 19 to … No dem… Inte… -#> 6 CCTCAACC… 838ea0… TRUE Neur… L4 IT L4 IT_2 Less t… Refere… Refere… Refe… -#> 7 CTCGACAA… 838ea0… FALSE Neur… L4 IT L4 IT_2 78 to … 12 to … No dem… Inte… -#> 8 AGCTACAG… 838ea0… FALSE Neur… L4 IT L4 IT_4 90+ ye… 16 to … Dement… High -#> 9 CTCGAGGG… 838ea0… FALSE Neur… L4 IT L4 IT_2 65 to … 16 to … Dement… High -#> 10 AGTGCCGT… 838ea0… FALSE Neur… L4 IT L4 IT_4 90+ ye… 16 to … Dement… High -#> # … with 168,850 more rows, 13 more variables: Braak.stage , -#> # Thal.phase , CERAD.score , APOE4.status , -#> # Lewy.body.disease.pathology , LATE.NC.stage , -#> # Microinfarct.pathology , Specimen.ID , Donor.ID , PMI , -#> # Number.of.UMIs , Genes.detected , -#> # Fraction.mitochrondrial.UMIs , and abbreviated variable names -#> # ¹​Neurotypical.reference, ²​Subclass, ³​Supertype, ⁴​Age.at.death, … +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) +unharmonised +#> # A tibble: 4 × 2 +#> file_id unharmonised +#> +#> 1 63523aa3-0d04-4fc6-ac59-5cadd3e73a14 +#> 2 8fee7b82-178b-4c04-bf23-04689415690d +#> 3 dc9d8cdd-29ee-4c44-830c-6559cb3d0af6 +#> 4 f7e94dbb-8638-4616-aaf9-16e2212c369f ``` -If we have metadata from the normal metadata table that is from a single -dataset, we can even join this 
additional metadata into one big data -frame: +Notice that the columns differ between each dataset’s data frame: ``` r -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) -#> # A tibble: 168,860 × 77 -#> cell_ sample_ cell_…¹ cell_…² confi…³ cell_…⁴ cell_…⁵ cell_…⁶ sampl…⁷ _samp…⁸ -#> -#> 1 GGAC… f63cb4… L2/3-6… neuron 1 168593… H21.33… -#> 2 TCAC… 0d4d1f… L2/3-6… neuron 1 f7d747… H21.33… -#> 3 TCAG… 3e5a3b… L2/3-6… neuron 1 3417a9… H20.33… -#> 4 TCAG… 7010a3… L2/3-6… neuron 1 246a59… H20.33… -#> 5 AGCC… 82bb9a… L2/3-6… neuron 1 7a8f35… H21.33… -#> 6 CCTC… a233eb… L2/3-6… neuron 1 188243… H18.30… -#> 7 CTCG… 27f104… L2/3-6… neuron 1 a62943… H20.33… -#> 8 AGCT… 0190a2… L2/3-6… neuron 1 c508a8… H20.33… -#> 9 CTCG… 95d846… L2/3-6… neuron 1 29285d… H21.33… -#> 10 AGTG… b0e1c5… L2/3-6… neuron 1 cd7823… H21.33… -#> # … with 168,850 more rows, 67 more variables: assay , -#> # assay_ontology_term_id , file_id_db , -#> # cell_type_ontology_term_id , development_stage , -#> # development_stage_ontology_term_id , disease , -#> # disease_ontology_term_id , ethnicity , -#> # ethnicity_ontology_term_id , experiment___ , file_id , -#> # is_primary_data_x , organism , organism_ontology_term_id , … +dplyr::pull(unharmonised, unharmonised) |> head(2) +#> [[1]] +#> # Source: SQL [?? 
x 17] +#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:] +#> cell_ file_id donor…¹ donor…² libra…³ mappe…⁴ sampl…⁵ suspe…⁶ suspe…⁷ autho…⁸ +#> +#> 1 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 2 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 3 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 4 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 5 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 6 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 7 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 8 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 9 4602… 63523a… 19 mon… 463181… 671785… GENCOD… 125234… cell c7485e… CD4 T … +#> 10 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> # … with more rows, 7 more variables: cell_state , +#> # reported_diseases , Short_Sample , Project , +#> # Experiment , compartment , broad_celltype , and abbreviated +#> # variable names ¹​donor_age, ²​donor_uuid, ³​library_uuid, +#> # ⁴​mapped_reference_annotation, ⁵​sample_uuid, ⁶​suspension_type, +#> # ⁷​suspension_uuid, ⁸​author_cell_type +#> +#> [[2]] +#> # Source: SQL [?? 
x 12] +#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:] +#> cell_ file_id orig.…¹ nCoun…² nFeat…³ seura…⁴ Project donor…⁵ compa…⁶ broad…⁷ +#> +#> 1 1069 8fee7b… 4602ST… 16082 3997 25 Experi… Wilms3 non_PT Pelvic… +#> 2 1214 8fee7b… 4602ST… 1037 606 25 Experi… Wilms3 non_PT Pelvic… +#> 3 2583 8fee7b… 4602ST… 3028 1361 25 Experi… Wilms3 non_PT Pelvic… +#> 4 2655 8fee7b… 4602ST… 1605 859 25 Experi… Wilms3 non_PT Pelvic… +#> 5 3609 8fee7b… 4602ST… 1144 682 25 Experi… Wilms3 non_PT Pelvic… +#> 6 3624 8fee7b… 4602ST… 1874 963 25 Experi… Wilms3 non_PT Pelvic… +#> 7 3946 8fee7b… 4602ST… 1296 755 25 Experi… Wilms3 non_PT Pelvic… +#> 8 5163 8fee7b… 4602ST… 11417 3255 25 Experi… Wilms3 non_PT Pelvic… +#> 9 5446 8fee7b… 4602ST… 1769 946 19 Experi… Wilms2 lympho… CD4 T … +#> 10 6275 8fee7b… 4602ST… 3750 1559 25 Experi… Wilms3 non_PT Pelvic… +#> # … with more rows, 2 more variables: author_cell_type , Sample , and +#> # abbreviated variable names ¹​orig.ident, ²​nCount_RNA, ³​nFeature_RNA, +#> # ⁴​seurat_clusters, ⁵​donor_id, ⁶​compartment, ⁷​broad_celltype ``` # Cell metadata From e0f2a8d331e38c349ff2066ddd69fc1b2f9c0bea Mon Sep 17 00:00:00 2001 From: Michael Milton Date: Thu, 9 Mar 2023 17:13:33 +1100 Subject: [PATCH 7/7] Re-run document --- NAMESPACE | 1 + man/get_unharmonised_dataset.Rd | 2 ++ 2 files changed, 3 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index cb89968..29842b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,6 +25,7 @@ importFrom(cli,cli_abort) importFrom(cli,cli_alert_info) importFrom(cli,cli_alert_success) importFrom(cli,cli_alert_warning) +importFrom(dbplyr,remote_con) importFrom(dplyr,as_tibble) importFrom(dplyr,collect) importFrom(dplyr,filter) diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd index e79ee8e..0821a23 100644 --- a/man/get_unharmonised_dataset.Rd +++ b/man/get_unharmonised_dataset.Rd @@ -41,9 +41,11 @@ make sense for these to live in the main metadata table. 
This function is a utility that allows easy fetching of this data if necessary. } \examples{ +\dontrun{ dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() unharmonised_meta = get_unharmonised_dataset(dataset) unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) } +}