diff --git a/NAMESPACE b/NAMESPACE index 2012c4c..29842b8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -25,13 +25,16 @@ importFrom(cli,cli_abort) importFrom(cli,cli_alert_info) importFrom(cli,cli_alert_success) importFrom(cli,cli_alert_warning) +importFrom(dbplyr,remote_con) importFrom(dplyr,as_tibble) importFrom(dplyr,collect) importFrom(dplyr,filter) importFrom(dplyr,full_join) +importFrom(dplyr,group_by) importFrom(dplyr,inner_join) importFrom(dplyr,mutate) importFrom(dplyr,pull) +importFrom(dplyr,summarise) importFrom(dplyr,tbl) importFrom(dplyr,tibble) importFrom(dplyr,transmute) diff --git a/R/query.R b/R/query.R index 0beb670..cb5e7a0 100644 --- a/R/query.R +++ b/R/query.R @@ -448,52 +448,3 @@ get_metadata <- function( dbConnect(drv = _, read_only = TRUE) |> tbl(db_path) } - -#' Returns unharmonised metadata for selected datasets. -#' -#' Various metadata fields are *not* common between datasets, so it does not -#' make sense for these to live in the main metadata table. This function is a -#' utility that allows easy fetching of this data if necessary. -#' -#' @param dataset_ids A character vector, where each entry is a dataset ID -#' obtained from the `$file_id` column of the table returned from -#' [get_metadata()] -#' @param remote_url Optional character vector of length 1. An HTTP URL pointing -#' to the root URL under which all the unharmonised dataset files are located. -#' @param cache_directory Optional character vector of length 1. A file path on -#' your local system to a directory (not a file) that will be used to store -#' the unharmonised metadata files. -#' @importFrom purrr map set_names -#' @importFrom glue glue -#' @importFrom DBI dbConnect -#' @importFrom duckdb duckdb -#' @importFrom dplyr tbl -#' @return A named list, where each name is a dataset file ID, and each value is -#' a "lazy data frame", ie a `tbl`. 
-#' @export -#' @examples -#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -#' unharmonised_meta = get_unharmonised_metadata(dataset) -#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) -get_unharmonised_metadata = function( - dataset_ids, - remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", - cache_directory = get_default_cache_dir() - ){ - unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised") - duck = duckdb() |> dbConnect(drv = _, read_only = TRUE) - dataset_ids |> - set_names() |> - map(function(dataset_id){ - file_name = glue::glue("{dataset_id}.parquet") - local_path = file.path(unharmonised_root, file_name) - glue("{remote_url}/{file_name}") |> - sync_remote_file( - local_path, - progress(type = "down", con = stderr()) - ) - tbl(duck, local_path) - }) -} diff --git a/R/unharmonised.R b/R/unharmonised.R new file mode 100644 index 0000000..601744d --- /dev/null +++ b/R/unharmonised.R @@ -0,0 +1,84 @@ +#' Returns unharmonised metadata for selected datasets. +#' +#' Various metadata fields are *not* common between datasets, so it does not +#' make sense for these to live in the main metadata table. This function is a +#' utility that allows easy fetching of this data if necessary. +#' +#' @param dataset_id A character vector, where each entry is a dataset ID +#' obtained from the `$file_id` column of the table returned from +#' [get_metadata()] +#' @param cells An optional character vector of cell IDs. If provided, only +#' metadata for those cells will be returned. +#' @param conn An optional DuckDB connection object. If provided, it will re-use +#' the existing connection instead of opening a new one. +#' @param remote_url Optional character vector of length 1. 
An HTTP URL pointing +#' to the root URL under which all the unharmonised dataset files are located. +#' @param cache_directory Optional character vector of length 1. A file path on +#' your local system to a directory (not a file) that will be used to store +#' the unharmonised metadata files. +#' @importFrom purrr map set_names +#' @importFrom glue glue +#' @importFrom DBI dbConnect +#' @importFrom duckdb duckdb +#' @importFrom dplyr tbl filter +#' @importFrom rlang .data +#' @return A "lazy data frame", ie a `tbl`, holding the unharmonised metadata +#' for `dataset_id`, limited to `cells` unless `cells` is `NULL` (the default). +#' @examples +#' \dontrun{ +#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" +#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() +#' unharmonised_tbl = get_unharmonised_dataset(dataset) |> dplyr::collect() +#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +#' } +get_unharmonised_dataset = function( + dataset_id, + cells = NULL, + conn = duckdb() |> dbConnect(drv = _, read_only = TRUE), + remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", + cache_directory = get_default_cache_dir() +){ + unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised") + file_name = glue::glue("{dataset_id}.parquet") + local_path = file.path(unharmonised_root, file_name) + glue("{remote_url}/{file_name}") |> + sync_remote_file( + local_path, + progress(type = "down", con = stderr()) + ) + # `cells = NULL` means "all cells"; an `%in% NULL` filter would match no rows. + dataset_tbl <- tbl(conn, local_path) + if (is.null(cells)) dataset_tbl else filter(dataset_tbl, .data$cell_ %in% cells) +} + +#' Returns unharmonised metadata for a metadata query +#' @inherit get_unharmonised_dataset description +#' @param metadata A lazy data frame obtained from [get_metadata()], filtered +#' down to some cells of interest +#' @inheritDotParams get_unharmonised_dataset -dataset_id -cells -conn +#' @return A tibble with two columns: +#' * `file_id`: the same `file_id` as the
main metadata table obtained from [get_metadata()] +#' * `unharmonised`: a nested tibble, with one row per cell in the input `metadata`, containing unharmonised metadata +#' @export +#' @importFrom dplyr group_by summarise filter collect +#' @importFrom rlang .data +#' @importFrom dbplyr remote_con +#' @examples +#' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +#' unharmonised <- get_unharmonised_metadata(harmonised) +get_unharmonised_metadata = function(metadata, ...){ + args = list(...) + metadata |> + collect() |> + group_by(.data$file_id) |> + summarise( + unharmonised = list( + dataset_id=.data$file_id[[1]], + cells=.data$cell_, + conn=remote_con(metadata) + ) |> + c(args) |> + do.call(get_unharmonised_dataset, args=_) |> + list() + ) +} diff --git a/README.Rmd b/README.Rmd index fe5f3db..a946119 100644 --- a/README.Rmd +++ b/README.Rmd @@ -279,26 +279,22 @@ knitr::include_graphics("man/figures/HLA_A_tissue_plot.png") Various metadata fields are *not* common between datasets, so it does not make sense for these to live in the main metadata table. However, we can -obtain it using the `get_unharmonised_metadata()` function. - -Note how this table has additional columns that are not in the normal metadata: +obtain it using the `get_unharmonised_metadata()` function. This function +returns a data frame with one row per dataset, including the `unharmonised` +column which contains unharmonised metadata as a nested data frame. 
```{r} -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -unharmonised_tbl +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) +unharmonised ``` -If we have metadata from the normal metadata table that is from a single dataset, -we can even join this additional metadata into one big data frame: +Notice that the columns differ between each dataset's data frame: + ```{r} -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +dplyr::pull(unharmonised, unharmonised) |> head(2) ``` - - # Cell metadata Dataset-specific columns (definitions available at cellxgene.cziscience.com) diff --git a/README.md b/README.md index 6b80025..2624f5b 100644 --- a/README.md +++ b/README.md @@ -70,18 +70,18 @@ metadata |> dplyr::count(tissue) #> # Source: SQL [?? x 2] #> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:] -#> tissue n -#> -#> 1 blood 47 -#> 2 respiratory airway 16 -#> 3 mammary gland epithelial cell (cell culture) 1 -#> 4 colon 3 -#> 5 intestine 18 -#> 6 pleural effusion 11 -#> 7 lymph node 15 -#> 8 lung 27 -#> 9 liver 24 -#> 10 axilla 10 +#> tissue n +#> +#> 1 cerebellum 3 +#> 2 telencephalon 2 +#> 3 heart 3 +#> 4 intestine 18 +#> 5 kidney 19 +#> 6 liver 24 +#> 7 lung 27 +#> 8 muscle organ 3 +#> 9 pancreas 5 +#> 10 placenta 3 #> # … with more rows ``` @@ -294,65 +294,68 @@ metadata |> Various metadata fields are *not* common between datasets, so it does not make sense for these to live in the main metadata table. However, we -can obtain it using the `get_unharmonised_metadata()` function. 
- -Note how this table has additional columns that are not in the normal -metadata: +can obtain it using the `get_unharmonised_metadata()` function. This +function returns a data frame with one row per dataset, including the +`unharmonised` column which contains unharmnised metadata as a nested +data frame. ``` r -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -unharmonised_tbl -#> # A tibble: 168,860 × 23 -#> cell_ file_id Neuro…¹ Class Subcl…² Super…³ Age.a…⁴ Years…⁵ Cogni…⁶ ADNC -#> -#> 1 GGACGAAG… 838ea0… FALSE Neur… L4 IT L4 IT_2 90+ ye… 16 to … Dement… High -#> 2 TCACGGGA… 838ea0… FALSE Neur… L4 IT L4 IT_1 90+ ye… 12 to … Dement… Inte… -#> 3 TCAGTTTT… 838ea0… FALSE Neur… L4 IT L4 IT_2 78 to … 16 to … No dem… Low -#> 4 TCAGTCCT… 838ea0… FALSE Neur… L4 IT L4 IT_4 78 to … 16 to … Dement… Inte… -#> 5 AGCCACGC… 838ea0… FALSE Neur… L4 IT L4 IT_2 78 to … 19 to … No dem… Inte… -#> 6 CCTCAACC… 838ea0… TRUE Neur… L4 IT L4 IT_2 Less t… Refere… Refere… Refe… -#> 7 CTCGACAA… 838ea0… FALSE Neur… L4 IT L4 IT_2 78 to … 12 to … No dem… Inte… -#> 8 AGCTACAG… 838ea0… FALSE Neur… L4 IT L4 IT_4 90+ ye… 16 to … Dement… High -#> 9 CTCGAGGG… 838ea0… FALSE Neur… L4 IT L4 IT_2 65 to … 16 to … Dement… High -#> 10 AGTGCCGT… 838ea0… FALSE Neur… L4 IT L4 IT_4 90+ ye… 16 to … Dement… High -#> # … with 168,850 more rows, 13 more variables: Braak.stage , -#> # Thal.phase , CERAD.score , APOE4.status , -#> # Lewy.body.disease.pathology , LATE.NC.stage , -#> # Microinfarct.pathology , Specimen.ID , Donor.ID , PMI , -#> # Number.of.UMIs , Genes.detected , -#> # Fraction.mitochrondrial.UMIs , and abbreviated variable names -#> # ¹​Neurotypical.reference, ²​Subclass, ³​Supertype, ⁴​Age.at.death, … +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) +unharmonised +#> # A tibble: 4 × 2 +#> 
file_id unharmonised +#> +#> 1 63523aa3-0d04-4fc6-ac59-5cadd3e73a14 +#> 2 8fee7b82-178b-4c04-bf23-04689415690d +#> 3 dc9d8cdd-29ee-4c44-830c-6559cb3d0af6 +#> 4 f7e94dbb-8638-4616-aaf9-16e2212c369f ``` -If we have metadata from the normal metadata table that is from a single -dataset, we can even join this additional metadata into one big data -frame: +Notice that the columns differ between each dataset’s data frame: ``` r -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) -#> # A tibble: 168,860 × 77 -#> cell_ sample_ cell_…¹ cell_…² confi…³ cell_…⁴ cell_…⁵ cell_…⁶ sampl…⁷ _samp…⁸ -#> -#> 1 GGAC… f63cb4… L2/3-6… neuron 1 168593… H21.33… -#> 2 TCAC… 0d4d1f… L2/3-6… neuron 1 f7d747… H21.33… -#> 3 TCAG… 3e5a3b… L2/3-6… neuron 1 3417a9… H20.33… -#> 4 TCAG… 7010a3… L2/3-6… neuron 1 246a59… H20.33… -#> 5 AGCC… 82bb9a… L2/3-6… neuron 1 7a8f35… H21.33… -#> 6 CCTC… a233eb… L2/3-6… neuron 1 188243… H18.30… -#> 7 CTCG… 27f104… L2/3-6… neuron 1 a62943… H20.33… -#> 8 AGCT… 0190a2… L2/3-6… neuron 1 c508a8… H20.33… -#> 9 CTCG… 95d846… L2/3-6… neuron 1 29285d… H21.33… -#> 10 AGTG… b0e1c5… L2/3-6… neuron 1 cd7823… H21.33… -#> # … with 168,850 more rows, 67 more variables: assay , -#> # assay_ontology_term_id , file_id_db , -#> # cell_type_ontology_term_id , development_stage , -#> # development_stage_ontology_term_id , disease , -#> # disease_ontology_term_id , ethnicity , -#> # ethnicity_ontology_term_id , experiment___ , file_id , -#> # is_primary_data_x , organism , organism_ontology_term_id , … +dplyr::pull(unharmonised, unharmonised) |> head(2) +#> [[1]] +#> # Source: SQL [?? 
x 17] +#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:] +#> cell_ file_id donor…¹ donor…² libra…³ mappe…⁴ sampl…⁵ suspe…⁶ suspe…⁷ autho…⁸ +#> +#> 1 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 2 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 3 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 4 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 5 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 6 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 7 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 8 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> 9 4602… 63523a… 19 mon… 463181… 671785… GENCOD… 125234… cell c7485e… CD4 T … +#> 10 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell d8a44f… Pelvic… +#> # … with more rows, 7 more variables: cell_state , +#> # reported_diseases , Short_Sample , Project , +#> # Experiment , compartment , broad_celltype , and abbreviated +#> # variable names ¹​donor_age, ²​donor_uuid, ³​library_uuid, +#> # ⁴​mapped_reference_annotation, ⁵​sample_uuid, ⁶​suspension_type, +#> # ⁷​suspension_uuid, ⁸​author_cell_type +#> +#> [[2]] +#> # Source: SQL [?? 
x 12] +#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:] +#> cell_ file_id orig.…¹ nCoun…² nFeat…³ seura…⁴ Project donor…⁵ compa…⁶ broad…⁷ +#> +#> 1 1069 8fee7b… 4602ST… 16082 3997 25 Experi… Wilms3 non_PT Pelvic… +#> 2 1214 8fee7b… 4602ST… 1037 606 25 Experi… Wilms3 non_PT Pelvic… +#> 3 2583 8fee7b… 4602ST… 3028 1361 25 Experi… Wilms3 non_PT Pelvic… +#> 4 2655 8fee7b… 4602ST… 1605 859 25 Experi… Wilms3 non_PT Pelvic… +#> 5 3609 8fee7b… 4602ST… 1144 682 25 Experi… Wilms3 non_PT Pelvic… +#> 6 3624 8fee7b… 4602ST… 1874 963 25 Experi… Wilms3 non_PT Pelvic… +#> 7 3946 8fee7b… 4602ST… 1296 755 25 Experi… Wilms3 non_PT Pelvic… +#> 8 5163 8fee7b… 4602ST… 11417 3255 25 Experi… Wilms3 non_PT Pelvic… +#> 9 5446 8fee7b… 4602ST… 1769 946 19 Experi… Wilms2 lympho… CD4 T … +#> 10 6275 8fee7b… 4602ST… 3750 1559 25 Experi… Wilms3 non_PT Pelvic… +#> # … with more rows, 2 more variables: author_cell_type , Sample , and +#> # abbreviated variable names ¹​orig.ident, ²​nCount_RNA, ³​nFeature_RNA, +#> # ⁴​seurat_clusters, ⁵​donor_id, ⁶​compartment, ⁷​broad_celltype ``` # Cell metadata diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd new file mode 100644 index 0000000..0821a23 --- /dev/null +++ b/man/get_unharmonised_dataset.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/unharmonised.R +\name{get_unharmonised_dataset} +\alias{get_unharmonised_dataset} +\title{Returns unharmonised metadata for selected datasets.} +\usage{ +get_unharmonised_dataset( + dataset_id, + cells = NULL, + conn = dbConnect(drv = duckdb(), read_only = TRUE), + remote_url = + "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", + cache_directory = get_default_cache_dir() +) +} +\arguments{ +\item{dataset_id}{A character vector, where each entry is a dataset ID +obtained from the \verb{$file_id} column of the table returned from 
+\code{\link[=get_metadata]{get_metadata()}}} + +\item{cells}{An optional character vector of cell IDs. If provided, only +metadata for those cells will be returned.} + +\item{conn}{An optional DuckDB connection object. If provided, it will re-use +the existing connection instead of opening a new one.} + +\item{remote_url}{Optional character vector of length 1. An HTTP URL pointing +to the root URL under which all the unharmonised dataset files are located.} + +\item{cache_directory}{Optional character vector of length 1. A file path on +your local system to a directory (not a file) that will be used to store +the unharmonised metadata files.} +} +\value{ +A named list, where each name is a dataset file ID, and each value is +a "lazy data frame", ie a \code{tbl}. +} +\description{ +Various metadata fields are \emph{not} common between datasets, so it does not +make sense for these to live in the main metadata table. This function is a +utility that allows easy fetching of this data if necessary. 
+} +\examples{ +\dontrun{ +dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" +harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() +unharmonised_meta = get_unharmonised_dataset(dataset) +unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) +dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +} +} diff --git a/man/get_unharmonised_metadata.Rd b/man/get_unharmonised_metadata.Rd index a140d4d..9d014f8 100644 --- a/man/get_unharmonised_metadata.Rd +++ b/man/get_unharmonised_metadata.Rd @@ -1,31 +1,38 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/query.R +% Please edit documentation in R/unharmonised.R \name{get_unharmonised_metadata} \alias{get_unharmonised_metadata} -\title{Returns unharmonised metadata for selected datasets.} +\title{Returns unharmonised metadata for a metadata query} \usage{ -get_unharmonised_metadata( - dataset_ids, - remote_url = - "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata", - cache_directory = get_default_cache_dir() -) +get_unharmonised_metadata(metadata, ...) } \arguments{ -\item{dataset_ids}{A character vector, where each entry is a dataset ID +\item{metadata}{A lazy data frame obtained from \code{\link[=get_metadata]{get_metadata()}}, filtered +down to some cells of interest} + +\item{...}{ + Arguments passed on to \code{\link[=get_unharmonised_dataset]{get_unharmonised_dataset}} + \describe{ + \item{\code{dataset_id}}{A character vector, where each entry is a dataset ID obtained from the \verb{$file_id} column of the table returned from \code{\link[=get_metadata]{get_metadata()}}} - -\item{remote_url}{Optional character vector of length 1. An HTTP URL pointing + \item{\code{cells}}{An optional character vector of cell IDs. If provided, only +metadata for those cells will be returned.} + \item{\code{conn}}{An optional DuckDB connection object. 
If provided, it will re-use +the existing connection instead of opening a new one.} + \item{\code{remote_url}}{Optional character vector of length 1. An HTTP URL pointing to the root URL under which all the unharmonised dataset files are located.} - -\item{cache_directory}{Optional character vector of length 1. A file path on + \item{\code{cache_directory}}{Optional character vector of length 1. A file path on your local system to a directory (not a file) that will be used to store the unharmonised metadata files.} + }} } \value{ -A named list, where each name is a dataset file ID, and each value is -a "lazy data frame", ie a \code{tbl}. +A tibble with two columns: +\itemize{ +\item \code{file_id}: the same \code{file_id} as the main metadata table obtained from \code{\link[=get_metadata]{get_metadata()}} +\item \code{unharmonised}: a nested tibble, with one row per cell in the input \code{metadata}, containing unharmonised metadata +} } \description{ Various metadata fields are \emph{not} common between datasets, so it does not @@ -33,9 +40,6 @@ make sense for these to live in the main metadata table. This function is a utility that allows easy fetching of this data if necessary. 
} \examples{ -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) } diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R index e73bb36..17b1bd0 100755 --- a/tests/testthat/test-query.R +++ b/tests/testthat/test-query.R @@ -157,18 +157,29 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", { ) }) -test_that("get_unharmonised_metadata works with one ID", { +test_that("get_unharmonised_dataset works with one ID", { dataset_id = "838ea006-2369-4e2c-b426-b2a744a2b02b" - unharmonised_meta = get_unharmonised_metadata(dataset_id) - unharmonised_tbl = unharmonised_meta[[dataset_id]] - - expect_type(unharmonised_meta, "list") - expect_s3_class(unharmonised_tbl, "tbl") + unharmonised_meta = get_unharmonised_dataset(dataset_id) + + expect_s3_class(unharmonised_meta, "tbl") }) -test_that("get_unharmonised_metadata works with multiple IDs", { - dataset_ids = c("838ea006-2369-4e2c-b426-b2a744a2b02b", "83b9cb97-9ee4-404d-8cdf-ccede8235356") - unharmonised_meta = get_unharmonised_metadata(dataset_ids) +test_that("get_unharmonised_metadata() returns the appropriate data", { + harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") + unharmonised <- get_unharmonised_metadata(harmonised) + + unharmonised |> is.data.frame() |> expect_true() + expect_setequal(colnames(unharmonised), c("file_id", "unharmonised")) + + # The number of cells in both harmonised and unharmonised should be the same + expect_equal( + dplyr::collect(harmonised) |> nrow(), + unharmonised$unharmonised |> 
purrr::map_int(function(df) dplyr::tally(df) |> dplyr::pull(n)) |> sum() + ) - expect_equal(names(unharmonised_meta), dataset_ids) + # The number of datasets in both harmonised and unharmonised should be the same + expect_equal( + harmonised |> dplyr::group_by(file_id) |> dplyr::n_groups(), + nrow(unharmonised) + ) }) diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd index 1bd6407..fd40f6d 100644 --- a/vignettes/Introduction.Rmd +++ b/vignettes/Introduction.Rmd @@ -297,22 +297,20 @@ knitr::include_graphics("../man/figures/HLA_A_tissue_plot.png") Various metadata fields are *not* common between datasets, so it does not make sense for these to live in the main metadata table. However, we can -obtain it using the `get_unharmonised_metadata()` function. - -Note how this table has additional columns that are not in the normal metadata: +obtain it using the `get_unharmonised_metadata()` function. This function +returns a data frame with one row per dataset, including the `unharmonised` +column which contains unharmonised metadata as a nested data frame. ```{r} -dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b" -unharmonised_meta = get_unharmonised_metadata(dataset) -unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]]) -unharmonised_tbl +harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel") +unharmonised <- get_unharmonised_metadata(harmonised) +unharmonised ``` -If we have metadata from the normal metadata table that is from a single dataset, -we can even join this additional metadata into one big data frame: +Notice that the columns differ between each dataset's data frame: + ```{r} -harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect() -dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_")) +dplyr::pull(unharmonised, unharmonised) |> head(2) ``` # Cell metadata