From 34e64fc24e313d48aceceddbfaa01f8e128b1965 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Wed, 8 Mar 2023 17:02:24 +1100
Subject: [PATCH 1/7] WIP implementation of the original suggested API

---
 NAMESPACE                        |  2 +
 R/query.R                        | 51 +----------------------
 R/unharmonised.R                 | 71 ++++++++++++++++++++++++++++++++
 man/get_unharmonised_dataset.Rd  | 43 +++++++++++++++++++
 man/get_unharmonised_metadata.Rd | 39 ++++++++----------
 5 files changed, 135 insertions(+), 71 deletions(-)
 create mode 100644 R/unharmonised.R
 create mode 100644 man/get_unharmonised_dataset.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 2012c4c..cb89968 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -29,9 +29,11 @@ importFrom(dplyr,as_tibble)
 importFrom(dplyr,collect)
 importFrom(dplyr,filter)
 importFrom(dplyr,full_join)
+importFrom(dplyr,group_by)
 importFrom(dplyr,inner_join)
 importFrom(dplyr,mutate)
 importFrom(dplyr,pull)
+importFrom(dplyr,summarise)
 importFrom(dplyr,tbl)
 importFrom(dplyr,tibble)
 importFrom(dplyr,transmute)
diff --git a/R/query.R b/R/query.R
index 0beb670..e24564c 100644
--- a/R/query.R
+++ b/R/query.R
@@ -447,53 +447,4 @@ get_metadata <- function(
     duckdb() |>
         dbConnect(drv = _, read_only = TRUE) |>
         tbl(db_path)
-}
-
-#' Returns unharmonised metadata for selected datasets.
-#'
-#' Various metadata fields are *not* common between datasets, so it does not
-#' make sense for these to live in the main metadata table. This function is a
-#' utility that allows easy fetching of this data if necessary.
-#'
-#' @param dataset_ids A character vector, where each entry is a dataset ID
-#'   obtained from the `$file_id` column of the table returned from
-#'   [get_metadata()]
-#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
-#'   to the root URL under which all the unharmonised dataset files are located.
-#' @param cache_directory Optional character vector of length 1. A file path on
-#'   your local system to a directory (not a file) that will be used to store
-#'   the unharmonised metadata files.
-#' @importFrom purrr map set_names
-#' @importFrom glue glue
-#' @importFrom DBI dbConnect
-#' @importFrom duckdb duckdb
-#' @importFrom dplyr tbl
-#' @return A named list, where each name is a dataset file ID, and each value is
-#'   a "lazy data frame", ie a `tbl`.
-#' @export
-#' @examples
-#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
-#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-#' unharmonised_meta = get_unharmonised_metadata(dataset)
-#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
-#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
-get_unharmonised_metadata = function(
-        dataset_ids,
-        remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
-        cache_directory = get_default_cache_dir()
-        ){
-    unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised")
-    duck = duckdb() |> dbConnect(drv = _, read_only = TRUE)
-    dataset_ids |> 
-        set_names() |>
-        map(function(dataset_id){
-            file_name = glue::glue("{dataset_id}.parquet")
-            local_path = file.path(unharmonised_root, file_name)
-            glue("{remote_url}/{file_name}") |>
-                sync_remote_file(
-                local_path,
-                progress(type = "down", con = stderr())
-            )
-            tbl(duck, local_path)
-        })
-}
+}
\ No newline at end of file
diff --git a/R/unharmonised.R b/R/unharmonised.R
new file mode 100644
index 0000000..4c75d6c
--- /dev/null
+++ b/R/unharmonised.R
@@ -0,0 +1,71 @@
+#' Returns unharmonised metadata for selected datasets.
+#'
+#' Various metadata fields are *not* common between datasets, so it does not
+#' make sense for these to live in the main metadata table. This function is a
+#' utility that allows easy fetching of this data if necessary.
+#'
+#' @param dataset_ids A character vector, where each entry is a dataset ID
+#'   obtained from the `$file_id` column of the table returned from
+#'   [get_metadata()]
+#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
+#'   to the root URL under which all the unharmonised dataset files are located.
+#' @param cache_directory Optional character vector of length 1. A file path on
+#'   your local system to a directory (not a file) that will be used to store
+#'   the unharmonised metadata files.
+#' @importFrom purrr map set_names
+#' @importFrom glue glue
+#' @importFrom DBI dbConnect
+#' @importFrom duckdb duckdb
+#' @importFrom dplyr tbl filter
+#' @return A named list, where each name is a dataset file ID, and each value is
+#'   a "lazy data frame", ie a `tbl`.
+#' @examples
+#' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
+#' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
+#' unharmonised_meta = get_unharmonised_metadata_list(dataset)
+#' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
+#' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
+get_unharmonised_dataset = function(
+        dataset_id,
+        cells = NULL,
+        conn = duckdb() |> dbConnect(drv = _, read_only = TRUE),
+        remote_url = "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
+        cache_directory = get_default_cache_dir()
+){
+    unharmonised_root <- file.path(cache_directory, COUNTS_VERSION, "unharmonised")
+    file_name = glue::glue("{dataset_id}.parquet")
+    local_path = file.path(unharmonised_root, file_name)
+    glue("{remote_url}/{file_name}") |>
+        sync_remote_file(
+            local_path,
+            progress(type = "down", con = stderr())
+        )
+    tbl(conn, local_path) |>
+        filter(cell_ %in% cells)
+}
+
+#' Returns unharmonised metadata for a metadata query
+#' @inherit get_unharmonised_dataset description
+#' @param metadata A lazy data frame obtained from [get_metadata()], filtered
+#'  down to some cells of interest
+#' @inheritDotParams get_unharmonised_dataset
+#' @return A tibble with two columns:
+#'  * `file_id`: the same `file_id` as the main metadata table obtained from [get_metadata()]
+#'  * `unharmonised`: a nested tibble, with one row per cell in the input `metadata`, containing unharmonised metadata
+#' @export
+#' @importFrom dplyr group_by summarise filter collect
+#' @examples
+#' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+#' unharmonised <- get_unharmonised_metadata(harmonised)
+get_unharmonised_metadata = function(metadata, ...){
+    args = list(...)
+    metadata |>
+        collect() |>
+        group_by(file_id) |>
+        summarise(
+            unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=metadata$src$con) |>
+                c(args) |> 
+                do.call(get_unharmonised_dataset, args=_) |> 
+                list()
+        )
+}
\ No newline at end of file
diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd
new file mode 100644
index 0000000..27c1bde
--- /dev/null
+++ b/man/get_unharmonised_dataset.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/unharmonised.R
+\name{get_unharmonised_dataset}
+\alias{get_unharmonised_dataset}
+\title{Returns unharmonised metadata for selected datasets.}
+\usage{
+get_unharmonised_dataset(
+  dataset_id,
+  cells = NULL,
+  conn = dbConnect(drv = duckdb(), read_only = TRUE),
+  remote_url =
+    "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
+  cache_directory = get_default_cache_dir()
+)
+}
+\arguments{
+\item{remote_url}{Optional character vector of length 1. An HTTP URL pointing
+to the root URL under which all the unharmonised dataset files are located.}
+
+\item{cache_directory}{Optional character vector of length 1. A file path on
+your local system to a directory (not a file) that will be used to store
+the unharmonised metadata files.}
+
+\item{dataset_ids}{A character vector, where each entry is a dataset ID
+obtained from the \verb{$file_id} column of the table returned from
+\code{\link[=get_metadata]{get_metadata()}}}
+}
+\value{
+A named list, where each name is a dataset file ID, and each value is
+a "lazy data frame", ie a \code{tbl}.
+}
+\description{
+Various metadata fields are \emph{not} common between datasets, so it does not
+make sense for these to live in the main metadata table. This function is a
+utility that allows easy fetching of this data if necessary.
+}
+\examples{
+dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
+harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
+unharmonised_meta = get_unharmonised_metadata_list(dataset)
+unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
+dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
+}
diff --git a/man/get_unharmonised_metadata.Rd b/man/get_unharmonised_metadata.Rd
index a140d4d..fb7c8a3 100644
--- a/man/get_unharmonised_metadata.Rd
+++ b/man/get_unharmonised_metadata.Rd
@@ -1,31 +1,31 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/query.R
+% Please edit documentation in R/unharmonised.R
 \name{get_unharmonised_metadata}
 \alias{get_unharmonised_metadata}
-\title{Returns unharmonised metadata for selected datasets.}
+\title{Returns unharmonised metadata for a metadata query}
 \usage{
-get_unharmonised_metadata(
-  dataset_ids,
-  remote_url =
-    "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/unharmonised_metadata",
-  cache_directory = get_default_cache_dir()
-)
+get_unharmonised_metadata(metadata, ...)
 }
 \arguments{
-\item{dataset_ids}{A character vector, where each entry is a dataset ID
-obtained from the \verb{$file_id} column of the table returned from
-\code{\link[=get_metadata]{get_metadata()}}}
+\item{metadata}{A lazy data frame obtained from \code{\link[=get_metadata]{get_metadata()}}, filtered
+down to some cells of interest}
 
-\item{remote_url}{Optional character vector of length 1. An HTTP URL pointing
+\item{...}{
+  Arguments passed on to \code{\link[=get_unharmonised_dataset]{get_unharmonised_dataset}}
+  \describe{
+    \item{\code{remote_url}}{Optional character vector of length 1. An HTTP URL pointing
 to the root URL under which all the unharmonised dataset files are located.}
-
-\item{cache_directory}{Optional character vector of length 1. A file path on
+    \item{\code{cache_directory}}{Optional character vector of length 1. A file path on
 your local system to a directory (not a file) that will be used to store
 the unharmonised metadata files.}
+  }}
 }
 \value{
-A named list, where each name is a dataset file ID, and each value is
-a "lazy data frame", ie a \code{tbl}.
+A tibble with two columns:
+\itemize{
+\item \code{file_id}: the same \code{file_id} as the main metadata table obtained from \code{\link[=get_metadata]{get_metadata()}}
+\item \code{unharmonised}: a nested tibble, with one row per cell in the input \code{metadata}, containing unharmonised metadata
+}
 }
 \description{
 Various metadata fields are \emph{not} common between datasets, so it does not
@@ -33,9 +33,6 @@ make sense for these to live in the main metadata table. This function is a
 utility that allows easy fetching of this data if necessary.
 }
 \examples{
-dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
-harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-unharmonised_meta = get_unharmonised_metadata(dataset)
-unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
-dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
+harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+unharmonised <- get_unharmonised_metadata(harmonised)
 }

From 54e59c45d8ab3c4b6905147bc68a2f0226742840 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Thu, 9 Mar 2023 13:21:19 +1100
Subject: [PATCH 2/7] Use public connection API

---
 R/unharmonised.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/unharmonised.R b/R/unharmonised.R
index 4c75d6c..efc71d2 100644
--- a/R/unharmonised.R
+++ b/R/unharmonised.R
@@ -63,7 +63,7 @@ get_unharmonised_metadata = function(metadata, ...){
         collect() |>
         group_by(file_id) |>
         summarise(
-            unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=metadata$src$con) |>
+            unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=dbplyr::remote_con(metadata)) |>
                 c(args) |> 
                 do.call(get_unharmonised_dataset, args=_) |> 
                 list()

From af560a4447365f452ee9b08ecc91ffb8ecef7823 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Thu, 9 Mar 2023 16:11:04 +1100
Subject: [PATCH 3/7] R CMD check fixes

---
 R/unharmonised.R                 | 16 +++++++++++-----
 README.Rmd                       | 22 +++++++++------------
 man/get_unharmonised_dataset.Rd  | 16 +++++++++++-----
 man/get_unharmonised_metadata.Rd |  7 +++++++
 tests/testthat/test-query.R      | 33 +++++++++++++++++++++-----------
 vignettes/Introduction.Rmd       | 20 +++++++++----------
 6 files changed, 69 insertions(+), 45 deletions(-)

diff --git a/R/unharmonised.R b/R/unharmonised.R
index efc71d2..1c9003e 100644
--- a/R/unharmonised.R
+++ b/R/unharmonised.R
@@ -4,9 +4,13 @@
 #' make sense for these to live in the main metadata table. This function is a
 #' utility that allows easy fetching of this data if necessary.
 #'
-#' @param dataset_ids A character vector, where each entry is a dataset ID
+#' @param dataset_id A character vector, where each entry is a dataset ID
 #'   obtained from the `$file_id` column of the table returned from
 #'   [get_metadata()]
+#' @param cells An optional character vector of cell IDs. If provided, only
+#'   metadata for those cells will be returned.
+#' @param conn An optional DuckDB connection object. If provided, it will re-use
+#'   the existing connection instead of opening a new one.
 #' @param remote_url Optional character vector of length 1. An HTTP URL pointing
 #'   to the root URL under which all the unharmonised dataset files are located.
 #' @param cache_directory Optional character vector of length 1. A file path on
@@ -17,12 +21,13 @@
 #' @importFrom DBI dbConnect
 #' @importFrom duckdb duckdb
 #' @importFrom dplyr tbl filter
+#' @importFrom rlang .data
 #' @return A named list, where each name is a dataset file ID, and each value is
 #'   a "lazy data frame", ie a `tbl`.
 #' @examples
 #' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
 #' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-#' unharmonised_meta = get_unharmonised_metadata_list(dataset)
+#' unharmonised_meta = get_unharmonised_dataset(dataset)
 #' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
 #' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
 get_unharmonised_dataset = function(
@@ -41,7 +46,7 @@ get_unharmonised_dataset = function(
             progress(type = "down", con = stderr())
         )
     tbl(conn, local_path) |>
-        filter(cell_ %in% cells)
+        filter(.data$cell_ %in% cells)
 }
 
 #' Returns unharmonised metadata for a metadata query
@@ -54,6 +59,7 @@ get_unharmonised_dataset = function(
 #'  * `unharmonised`: a nested tibble, with one row per cell in the input `metadata`, containing unharmonised metadata
 #' @export
 #' @importFrom dplyr group_by summarise filter collect
+#' @importFrom rlang .data
 #' @examples
 #' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
 #' unharmonised <- get_unharmonised_metadata(harmonised)
@@ -61,9 +67,9 @@ get_unharmonised_metadata = function(metadata, ...){
     args = list(...)
     metadata |>
         collect() |>
-        group_by(file_id) |>
+        group_by(.data$file_id) |>
         summarise(
-            unharmonised = list(dataset_id=file_id[[1]], cells=cell_, conn=dbplyr::remote_con(metadata)) |>
+            unharmonised = list(dataset_id=.data$file_id[[1]], cells=.data$cell_, conn=dbplyr::remote_con(metadata)) |>
                 c(args) |> 
                 do.call(get_unharmonised_dataset, args=_) |> 
                 list()
diff --git a/README.Rmd b/README.Rmd
index fe5f3db..a946119 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -279,26 +279,22 @@ knitr::include_graphics("man/figures/HLA_A_tissue_plot.png")
 
 Various metadata fields are *not* common between datasets, so it does not
 make sense for these to live in the main metadata table. However, we can
-obtain it using the `get_unharmonised_metadata()` function.
-
-Note how this table has additional columns that are not in the normal metadata:
+obtain it using the `get_unharmonised_metadata()` function. This function
+returns a data frame with one row per dataset, including the `unharmonised` 
+column which contains unharmonised metadata as a nested data frame.
 
 ```{r}
-dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
-unharmonised_meta = get_unharmonised_metadata(dataset)
-unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
-unharmonised_tbl
+harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+unharmonised <- get_unharmonised_metadata(harmonised)
+unharmonised
 ```
 
-If we have metadata from the normal metadata table that is from a single dataset,
-we can even join this additional metadata into one big data frame:
+Notice that the columns differ between each dataset's data frame:
+
 ```{r}
-harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
+dplyr::pull(unharmonised, unharmonised) |> head(2)
 ```
 
-
-
 # Cell metadata
 
 Dataset-specific columns (definitions available at cellxgene.cziscience.com)
diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd
index 27c1bde..e79ee8e 100644
--- a/man/get_unharmonised_dataset.Rd
+++ b/man/get_unharmonised_dataset.Rd
@@ -14,16 +14,22 @@ get_unharmonised_dataset(
 )
 }
 \arguments{
+\item{dataset_id}{A character vector, where each entry is a dataset ID
+obtained from the \verb{$file_id} column of the table returned from
+\code{\link[=get_metadata]{get_metadata()}}}
+
+\item{cells}{An optional character vector of cell IDs. If provided, only
+metadata for those cells will be returned.}
+
+\item{conn}{An optional DuckDB connection object. If provided, it will re-use
+the existing connection instead of opening a new one.}
+
 \item{remote_url}{Optional character vector of length 1. An HTTP URL pointing
 to the root URL under which all the unharmonised dataset files are located.}
 
 \item{cache_directory}{Optional character vector of length 1. A file path on
 your local system to a directory (not a file) that will be used to store
 the unharmonised metadata files.}
-
-\item{dataset_ids}{A character vector, where each entry is a dataset ID
-obtained from the \verb{$file_id} column of the table returned from
-\code{\link[=get_metadata]{get_metadata()}}}
 }
 \value{
 A named list, where each name is a dataset file ID, and each value is
@@ -37,7 +43,7 @@ utility that allows easy fetching of this data if necessary.
 \examples{
 dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
 harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-unharmonised_meta = get_unharmonised_metadata_list(dataset)
+unharmonised_meta = get_unharmonised_dataset(dataset)
 unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
 dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
 }
diff --git a/man/get_unharmonised_metadata.Rd b/man/get_unharmonised_metadata.Rd
index fb7c8a3..9d014f8 100644
--- a/man/get_unharmonised_metadata.Rd
+++ b/man/get_unharmonised_metadata.Rd
@@ -13,6 +13,13 @@ down to some cells of interest}
 \item{...}{
   Arguments passed on to \code{\link[=get_unharmonised_dataset]{get_unharmonised_dataset}}
   \describe{
+    \item{\code{dataset_id}}{A character vector, where each entry is a dataset ID
+obtained from the \verb{$file_id} column of the table returned from
+\code{\link[=get_metadata]{get_metadata()}}}
+    \item{\code{cells}}{An optional character vector of cell IDs. If provided, only
+metadata for those cells will be returned.}
+    \item{\code{conn}}{An optional DuckDB connection object. If provided, it will re-use
+the existing connection instead of opening a new one.}
     \item{\code{remote_url}}{Optional character vector of length 1. An HTTP URL pointing
 to the root URL under which all the unharmonised dataset files are located.}
     \item{\code{cache_directory}}{Optional character vector of length 1. A file path on
diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index e73bb36..371c759 100755
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -157,18 +157,29 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {
     )
 })
 
-test_that("get_unharmonised_metadata works with one ID", {
+test_that("get_unharmonised_dataset works with one ID", {
     dataset_id = "838ea006-2369-4e2c-b426-b2a744a2b02b"
-    unharmonised_meta = get_unharmonised_metadata(dataset_id)
-    unharmonised_tbl = unharmonised_meta[[dataset_id]]
-    
-    expect_type(unharmonised_meta, "list")
-    expect_s3_class(unharmonised_tbl, "tbl")
+    unharmonised_meta = get_unharmonised_dataset(dataset_id)
+
+    expect_s3_class(unharmonised_meta, "tbl")
 })
 
-test_that("get_unharmonised_metadata works with multiple IDs", {
-    dataset_ids = c("838ea006-2369-4e2c-b426-b2a744a2b02b", "83b9cb97-9ee4-404d-8cdf-ccede8235356")
-    unharmonised_meta = get_unharmonised_metadata(dataset_ids)
+test_that("get_unharmonised_metadata() returns the appropriate data", {
+    harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+    unharmonised <- get_unharmonised_metadata(harmonised)
     
-    expect_equal(names(unharmonised_meta), dataset_ids)
-})
+    unharmonised |> is.data.frame() |> expect_true()
+    expect_setequal(colnames(unharmonised), c("file_id", "unharmonised"))
+    
+    # The number of cells in both harmonised and unharmonised should be the same
+    expect_equal(
+        dplyr::collect(harmonised) |> nrow(),
+        unharmonised$unharmonised |> purrr::map_int(function(df) dplyr::tally(df) |> dplyr::pull(n)) |> sum()
+    )
+    
+    # The number of datasets in both harmonised and unharmonised should be the same
+    expect_equal(
+        harmonised |> dplyr::group_by(file_id) |> dplyr::n_groups(),
+        nrow(unharmonised)
+    )
+})
\ No newline at end of file
diff --git a/vignettes/Introduction.Rmd b/vignettes/Introduction.Rmd
index 1bd6407..fd40f6d 100644
--- a/vignettes/Introduction.Rmd
+++ b/vignettes/Introduction.Rmd
@@ -297,22 +297,20 @@ knitr::include_graphics("../man/figures/HLA_A_tissue_plot.png")
 
 Various metadata fields are *not* common between datasets, so it does not
 make sense for these to live in the main metadata table. However, we can
-obtain it using the `get_unharmonised_metadata()` function.
-
-Note how this table has additional columns that are not in the normal metadata:
+obtain it using the `get_unharmonised_metadata()` function. This function
+returns a data frame with one row per dataset, including the `unharmonised` 
+column which contains unharmonised metadata as a nested data frame.
 
 ```{r}
-dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
-unharmonised_meta = get_unharmonised_metadata(dataset)
-unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
-unharmonised_tbl
+harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+unharmonised <- get_unharmonised_metadata(harmonised)
+unharmonised
 ```
 
-If we have metadata from the normal metadata table that is from a single dataset,
-we can even join this additional metadata into one big data frame:
+Notice that the columns differ between each dataset's data frame:
+
 ```{r}
-harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
+dplyr::pull(unharmonised, unharmonised) |> head(2)
 ```
 
 # Cell metadata

From d39cb9e86c82f394bb43c7e293b58100ef3ef514 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Thu, 9 Mar 2023 16:13:40 +1100
Subject: [PATCH 4/7] Line endings

---
 R/query.R        | 2 +-
 R/unharmonised.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/query.R b/R/query.R
index e24564c..cb5e7a0 100644
--- a/R/query.R
+++ b/R/query.R
@@ -447,4 +447,4 @@ get_metadata <- function(
     duckdb() |>
         dbConnect(drv = _, read_only = TRUE) |>
         tbl(db_path)
-}
\ No newline at end of file
+}
diff --git a/R/unharmonised.R b/R/unharmonised.R
index 1c9003e..9c743e9 100644
--- a/R/unharmonised.R
+++ b/R/unharmonised.R
@@ -74,4 +74,4 @@ get_unharmonised_metadata = function(metadata, ...){
                 do.call(get_unharmonised_dataset, args=_) |> 
                 list()
         )
-}
\ No newline at end of file
+}

From dfdfc77fc64ddbca4c4533aa6d0c6e407d4f0037 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Thu, 9 Mar 2023 16:15:19 +1100
Subject: [PATCH 5/7] More line endings

---
 tests/testthat/test-query.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index 371c759..17b1bd0 100755
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -182,4 +182,4 @@ test_that("get_unharmonised_metadata() returns the appropriate data", {
         harmonised |> dplyr::group_by(file_id) |> dplyr::n_groups(),
         nrow(unharmonised)
     )
-})
\ No newline at end of file
+})

From 626d6145df0a76e1d9c99fa7b21dc7af77b918e3 Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Thu, 9 Mar 2023 16:54:39 +1100
Subject: [PATCH 6/7] Rebuild readme, fix example error

---
 R/unharmonised.R |   9 +++-
 README.md        | 133 ++++++++++++++++++++++++-----------------------
 2 files changed, 76 insertions(+), 66 deletions(-)

diff --git a/R/unharmonised.R b/R/unharmonised.R
index 9c743e9..601744d 100644
--- a/R/unharmonised.R
+++ b/R/unharmonised.R
@@ -25,11 +25,13 @@
 #' @return A named list, where each name is a dataset file ID, and each value is
 #'   a "lazy data frame", ie a `tbl`.
 #' @examples
+#' \dontrun{
 #' dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
 #' harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
 #' unharmonised_meta = get_unharmonised_dataset(dataset)
 #' unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
 #' dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
+#' }
 get_unharmonised_dataset = function(
         dataset_id,
         cells = NULL,
@@ -60,6 +62,7 @@ get_unharmonised_dataset = function(
 #' @export
 #' @importFrom dplyr group_by summarise filter collect
 #' @importFrom rlang .data
+#' @importFrom dbplyr remote_con
 #' @examples
 #' harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
 #' unharmonised <- get_unharmonised_metadata(harmonised)
@@ -69,7 +72,11 @@ get_unharmonised_metadata = function(metadata, ...){
         collect() |>
         group_by(.data$file_id) |>
         summarise(
-            unharmonised = list(dataset_id=.data$file_id[[1]], cells=.data$cell_, conn=dbplyr::remote_con(metadata)) |>
+            unharmonised = list(
+              dataset_id=.data$file_id[[1]],
+              cells=.data$cell_,
+              conn=remote_con(metadata)
+            ) |>
                 c(args) |> 
                 do.call(get_unharmonised_dataset, args=_) |> 
                 list()
diff --git a/README.md b/README.md
index 6b80025..2624f5b 100644
--- a/README.md
+++ b/README.md
@@ -70,18 +70,18 @@ metadata |>
   dplyr::count(tissue)
 #> # Source:   SQL [?? x 2]
 #> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:]
-#>    tissue                                           n
-#>    <chr>                                        <dbl>
-#>  1 blood                                           47
-#>  2 respiratory airway                              16
-#>  3 mammary gland epithelial cell (cell culture)     1
-#>  4 colon                                            3
-#>  5 intestine                                       18
-#>  6 pleural effusion                                11
-#>  7 lymph node                                      15
-#>  8 lung                                            27
-#>  9 liver                                           24
-#> 10 axilla                                          10
+#>    tissue            n
+#>    <chr>         <dbl>
+#>  1 cerebellum        3
+#>  2 telencephalon     2
+#>  3 heart             3
+#>  4 intestine        18
+#>  5 kidney           19
+#>  6 liver            24
+#>  7 lung             27
+#>  8 muscle organ      3
+#>  9 pancreas          5
+#> 10 placenta          3
 #> # … with more rows
 ```
 
@@ -294,65 +294,68 @@ metadata |>
 
 Various metadata fields are *not* common between datasets, so it does
 not make sense for these to live in the main metadata table. However, we
-can obtain it using the `get_unharmonised_metadata()` function.
-
-Note how this table has additional columns that are not in the normal
-metadata:
+can obtain it using the `get_unharmonised_metadata()` function. This
+function returns a data frame with one row per dataset, including the
+`unharmonised` column which contains unharmonised metadata as a nested
+data frame.
 
 ``` r
-dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
-unharmonised_meta = get_unharmonised_metadata(dataset)
-unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
-unharmonised_tbl
-#> # A tibble: 168,860 × 23
-#>    cell_     file_id Neuro…¹ Class Subcl…² Super…³ Age.a…⁴ Years…⁵ Cogni…⁶ ADNC 
-#>    <chr>     <chr>   <lgl>   <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>
-#>  1 GGACGAAG… 838ea0… FALSE   Neur… L4 IT   L4 IT_2 90+ ye… 16 to … Dement… High 
-#>  2 TCACGGGA… 838ea0… FALSE   Neur… L4 IT   L4 IT_1 90+ ye… 12 to … Dement… Inte…
-#>  3 TCAGTTTT… 838ea0… FALSE   Neur… L4 IT   L4 IT_2 78 to … 16 to … No dem… Low  
-#>  4 TCAGTCCT… 838ea0… FALSE   Neur… L4 IT   L4 IT_4 78 to … 16 to … Dement… Inte…
-#>  5 AGCCACGC… 838ea0… FALSE   Neur… L4 IT   L4 IT_2 78 to … 19 to … No dem… Inte…
-#>  6 CCTCAACC… 838ea0… TRUE    Neur… L4 IT   L4 IT_2 Less t… Refere… Refere… Refe…
-#>  7 CTCGACAA… 838ea0… FALSE   Neur… L4 IT   L4 IT_2 78 to … 12 to … No dem… Inte…
-#>  8 AGCTACAG… 838ea0… FALSE   Neur… L4 IT   L4 IT_4 90+ ye… 16 to … Dement… High 
-#>  9 CTCGAGGG… 838ea0… FALSE   Neur… L4 IT   L4 IT_2 65 to … 16 to … Dement… High 
-#> 10 AGTGCCGT… 838ea0… FALSE   Neur… L4 IT   L4 IT_4 90+ ye… 16 to … Dement… High 
-#> # … with 168,850 more rows, 13 more variables: Braak.stage <chr>,
-#> #   Thal.phase <chr>, CERAD.score <chr>, APOE4.status <chr>,
-#> #   Lewy.body.disease.pathology <chr>, LATE.NC.stage <chr>,
-#> #   Microinfarct.pathology <chr>, Specimen.ID <chr>, Donor.ID <chr>, PMI <chr>,
-#> #   Number.of.UMIs <dbl>, Genes.detected <dbl>,
-#> #   Fraction.mitochrondrial.UMIs <dbl>, and abbreviated variable names
-#> #   ¹​Neurotypical.reference, ²​Subclass, ³​Supertype, ⁴​Age.at.death, …
+harmonised <- get_metadata() |> dplyr::filter(tissue == "kidney blood vessel")
+unharmonised <- get_unharmonised_metadata(harmonised)
+unharmonised
+#> # A tibble: 4 × 2
+#>   file_id                              unharmonised   
+#>   <chr>                                <list>         
+#> 1 63523aa3-0d04-4fc6-ac59-5cadd3e73a14 <tbl_dck_[,17]>
+#> 2 8fee7b82-178b-4c04-bf23-04689415690d <tbl_dck_[,12]>
+#> 3 dc9d8cdd-29ee-4c44-830c-6559cb3d0af6 <tbl_dck_[,14]>
+#> 4 f7e94dbb-8638-4616-aaf9-16e2212c369f <tbl_dck_[,14]>
 ```
 
-If we have metadata from the normal metadata table that is from a single
-dataset, we can even join this additional metadata into one big data
-frame:
+Notice that the columns differ between each dataset’s data frame:
 
 ``` r
-harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
-dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
-#> # A tibble: 168,860 × 77
-#>    cell_ sample_ cell_…¹ cell_…² confi…³ cell_…⁴ cell_…⁵ cell_…⁶ sampl…⁷ _samp…⁸
-#>    <chr> <chr>   <chr>   <chr>     <dbl> <chr>   <chr>   <chr>   <chr>   <chr>  
-#>  1 GGAC… f63cb4… L2/3-6… neuron        1 <NA>    <NA>    <NA>    168593… H21.33…
-#>  2 TCAC… 0d4d1f… L2/3-6… neuron        1 <NA>    <NA>    <NA>    f7d747… H21.33…
-#>  3 TCAG… 3e5a3b… L2/3-6… neuron        1 <NA>    <NA>    <NA>    3417a9… H20.33…
-#>  4 TCAG… 7010a3… L2/3-6… neuron        1 <NA>    <NA>    <NA>    246a59… H20.33…
-#>  5 AGCC… 82bb9a… L2/3-6… neuron        1 <NA>    <NA>    <NA>    7a8f35… H21.33…
-#>  6 CCTC… a233eb… L2/3-6… neuron        1 <NA>    <NA>    <NA>    188243… H18.30…
-#>  7 CTCG… 27f104… L2/3-6… neuron        1 <NA>    <NA>    <NA>    a62943… H20.33…
-#>  8 AGCT… 0190a2… L2/3-6… neuron        1 <NA>    <NA>    <NA>    c508a8… H20.33…
-#>  9 CTCG… 95d846… L2/3-6… neuron        1 <NA>    <NA>    <NA>    29285d… H21.33…
-#> 10 AGTG… b0e1c5… L2/3-6… neuron        1 <NA>    <NA>    <NA>    cd7823… H21.33…
-#> # … with 168,850 more rows, 67 more variables: assay <chr>,
-#> #   assay_ontology_term_id <chr>, file_id_db <chr>,
-#> #   cell_type_ontology_term_id <chr>, development_stage <chr>,
-#> #   development_stage_ontology_term_id <chr>, disease <chr>,
-#> #   disease_ontology_term_id <chr>, ethnicity <chr>,
-#> #   ethnicity_ontology_term_id <chr>, experiment___ <chr>, file_id <chr>,
-#> #   is_primary_data_x <chr>, organism <chr>, organism_ontology_term_id <chr>, …
+dplyr::pull(unharmonised, unharmonised) |> head(2)
+#> [[1]]
+#> # Source:   SQL [?? x 17]
+#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:]
+#>    cell_ file_id donor…¹ donor…² libra…³ mappe…⁴ sampl…⁵ suspe…⁶ suspe…⁷ autho…⁸
+#>    <chr> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
+#>  1 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  2 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  3 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  4 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  5 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  6 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  7 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  8 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#>  9 4602… 63523a… 19 mon… 463181… 671785… GENCOD… 125234… cell    c7485e… CD4 T …
+#> 10 4602… 63523a… 27 mon… a8536b… 5ddaea… GENCOD… 61bf84… cell    d8a44f… Pelvic…
+#> # … with more rows, 7 more variables: cell_state <chr>,
+#> #   reported_diseases <chr>, Short_Sample <chr>, Project <chr>,
+#> #   Experiment <chr>, compartment <chr>, broad_celltype <chr>, and abbreviated
+#> #   variable names ¹​donor_age, ²​donor_uuid, ³​library_uuid,
+#> #   ⁴​mapped_reference_annotation, ⁵​sample_uuid, ⁶​suspension_type,
+#> #   ⁷​suspension_uuid, ⁸​author_cell_type
+#> 
+#> [[2]]
+#> # Source:   SQL [?? x 12]
+#> # Database: DuckDB 0.6.2-dev1166 [unknown@Linux 3.10.0-1160.81.1.el7.x86_64:R 4.2.1/:memory:]
+#>    cell_ file_id orig.…¹ nCoun…² nFeat…³ seura…⁴ Project donor…⁵ compa…⁶ broad…⁷
+#>    <chr> <chr>   <chr>     <dbl> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
+#>  1 1069  8fee7b… 4602ST…   16082 3997    25      Experi… Wilms3  non_PT  Pelvic…
+#>  2 1214  8fee7b… 4602ST…    1037 606     25      Experi… Wilms3  non_PT  Pelvic…
+#>  3 2583  8fee7b… 4602ST…    3028 1361    25      Experi… Wilms3  non_PT  Pelvic…
+#>  4 2655  8fee7b… 4602ST…    1605 859     25      Experi… Wilms3  non_PT  Pelvic…
+#>  5 3609  8fee7b… 4602ST…    1144 682     25      Experi… Wilms3  non_PT  Pelvic…
+#>  6 3624  8fee7b… 4602ST…    1874 963     25      Experi… Wilms3  non_PT  Pelvic…
+#>  7 3946  8fee7b… 4602ST…    1296 755     25      Experi… Wilms3  non_PT  Pelvic…
+#>  8 5163  8fee7b… 4602ST…   11417 3255    25      Experi… Wilms3  non_PT  Pelvic…
+#>  9 5446  8fee7b… 4602ST…    1769 946     19      Experi… Wilms2  lympho… CD4 T …
+#> 10 6275  8fee7b… 4602ST…    3750 1559    25      Experi… Wilms3  non_PT  Pelvic…
+#> # … with more rows, 2 more variables: author_cell_type <chr>, Sample <chr>, and
+#> #   abbreviated variable names ¹​orig.ident, ²​nCount_RNA, ³​nFeature_RNA,
+#> #   ⁴​seurat_clusters, ⁵​donor_id, ⁶​compartment, ⁷​broad_celltype
 ```
 
 # Cell metadata

From e0f2a8d331e38c349ff2066ddd69fc1b2f9c0bea Mon Sep 17 00:00:00 2001
From: Michael Milton <ttmigueltt@gmail.com>
Date: Thu, 9 Mar 2023 17:13:33 +1100
Subject: [PATCH 7/7] Re-run document

---
 NAMESPACE                       | 1 +
 man/get_unharmonised_dataset.Rd | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index cb89968..29842b8 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -25,6 +25,7 @@ importFrom(cli,cli_abort)
 importFrom(cli,cli_alert_info)
 importFrom(cli,cli_alert_success)
 importFrom(cli,cli_alert_warning)
+importFrom(dbplyr,remote_con)
 importFrom(dplyr,as_tibble)
 importFrom(dplyr,collect)
 importFrom(dplyr,filter)
diff --git a/man/get_unharmonised_dataset.Rd b/man/get_unharmonised_dataset.Rd
index e79ee8e..0821a23 100644
--- a/man/get_unharmonised_dataset.Rd
+++ b/man/get_unharmonised_dataset.Rd
@@ -41,9 +41,11 @@ make sense for these to live in the main metadata table. This function is a
 utility that allows easy fetching of this data if necessary.
 }
 \examples{
+\dontrun{
 dataset = "838ea006-2369-4e2c-b426-b2a744a2b02b"
 harmonised_meta = get_metadata() |> dplyr::filter(file_id == dataset) |> dplyr::collect()
 unharmonised_meta = get_unharmonised_dataset(dataset)
 unharmonised_tbl = dplyr::collect(unharmonised_meta[[dataset]])
 dplyr::left_join(harmonised_meta, unharmonised_tbl, by=c("file_id", "cell_"))
 }
+}