diff --git a/DESCRIPTION b/DESCRIPTION index 6ed5bdd..88d82dc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,32 +1,35 @@ Package: phsopendata Title: Extract Open Data from opendata.nhs.scot Version: 0.1.0 -Authors@R: - c(person(given = "Csilla", family = "Scharle", role = c("aut", "cre"), email = "csilla.scharle2@phs.scot"), - person("James", "McMahon", email = "james.mcmahon@phs.scot", role = "aut"), - person("David", "Aikman", email = "david.aikman@phs.scot", role = "aut")) -Description: Functions to extract and interact with data from the Scottish Health and Social Care Open Data platform. +Authors@R: c( + person("Csilla", "Scharle", , "csilla.scharle2@phs.scot", role = c("aut", "cre")), + person("James", "McMahon", , "james.mcmahon@phs.scot", role = "aut"), + person("David", "Aikman", , "david.aikman@phs.scot", role = "aut") + ) +Description: Functions to extract and interact with data from the Scottish + Health and Social Care Open Data platform. License: GPL (>= 2) +URL: https://github.com/Public-Health-Scotland/phsopendata, + https://public-health-scotland.github.io/phsopendata/ +BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues Imports: + cli, dplyr (>= 1.0.0), - httr (>= 1.0.0), glue (>= 1.0.0), - purrr, - tibble (>= 3.0.0), + httr (>= 1.0.0), jsonlite (>= 1.0.0), magrittr (>= 1.0.0), + purrr, readr (>= 1.0.0), stringdist, - cli, + tibble (>= 3.0.0), xml2 Suggests: covr, testthat (>= 3.0.0) +Config/testthat/edition: 3 +Config/testthat/parallel: true Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.1 -Config/testthat/edition: 3 -URL: https://github.com/Public-Health-Scotland/phsopendata, - https://public-health-scotland.github.io/phsopendata/ -BugReports: https://github.com/Public-Health-Scotland/phsopendata/issues diff --git a/R/add_context.R b/R/add_context.R new file mode 100644 index 0000000..c473b4d --- /dev/null +++ b/R/add_context.R @@ -0,0 +1,28 @@ +add_context <- function(data, id, name, created_date, modified_date) { + # Catch if the resource has never been modified + if (is.null(modified_date)) { + modified_date <- NA_character_ + } + + # Parse the date values + created_date <- as.POSIXct(created_date, format = "%FT%X", tz = "UTC") + modified_date <- as.POSIXct(modified_date, format = "%FT%X", tz = "UTC") + + # The platform can record the modified date as being before the created date + # by a few microseconds, this will catch any rounding which ensure + # created_date is always <= modified_date + if (modified_date < created_date) { + modified_date <- created_date + } + + data_with_context <- dplyr::mutate( + data, + "ResID" = id, + "ResName" = name, + "ResCreatedDate" = created_date, + "ResModifiedDate" = modified_date, + .before = dplyr::everything() + ) + + return(data_with_context) +} diff --git a/R/get_dataset.R b/R/get_dataset.R index bc93a93..98cef2f 100644 --- a/R/get_dataset.R +++ b/R/get_dataset.R @@ -7,18 +7,21 @@ #' it will return the n latest resources #' @param rows (optional) specify the max number of rows #' to return for each resource. +#' @inheritParams get_resource #' #' @seealso [get_resource()] for downloading a single resource #' from a dataset. #' -#' @importFrom magrittr %>% #' @return a [tibble][tibble::tibble-package] with the data #' @export #' #' @examples get_dataset("gp-practice-populations", #' max_resources = 2, rows = 10 #' ) -get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) { +get_dataset <- function(dataset_name, + max_resources = NULL, + rows = NULL, + include_context = FALSE) { # throw error if name type/format is invalid check_dataset_name(dataset_name) @@ -37,13 +40,15 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) { # define list of resource IDs to get all_ids <- purrr::map_chr(content$result$resources, ~ .x$id) + n_res <- length(all_ids) res_index <- 1:min(n_res, max_resources) - ids_selection <- all_ids[res_index] + + selection_ids <- all_ids[res_index] # get all resources all_data <- purrr::map( - ids_selection, + selection_ids, get_resource, rows = rows ) @@ -51,9 +56,10 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) { # resolve class issues types <- purrr::map( all_data, - ~ unlist(lapply(.x, class)) + ~ purrr::map_chr(.x, class) ) + # for each df, check if next df class matches inconsistencies <- vector(length = length(types) - 1, mode = "list") for (i in seq_along(types)) { @@ -73,8 +79,7 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) { } # define which columns to coerce and warn - conflicts <- unlist(inconsistencies) - to_coerce <- unique(names(conflicts)) + to_coerce <- unique(names(unlist(inconsistencies))) if (length(to_coerce) > 0) { cli::cli_warn(c( @@ -82,19 +87,35 @@ get_dataset <- function(dataset_name, max_resources = NULL, rows = NULL) { the following {cli::qty(to_coerce)} column{?s} ha{?s/ve} been coerced to type character:", "{.val {to_coerce}}" )) - } - # combine - combined <- purrr::map_df( - all_data, - ~ dplyr::mutate( - .x, - dplyr::across( - dplyr::any_of(to_coerce), - as.character + all_data <- purrr::map( + all_data, + ~ dplyr::mutate( + .x, + dplyr::across( + dplyr::any_of(to_coerce), + as.character + ) ) ) - ) + } + + if (include_context) { + # Add the 'resource context' as columns to the data + all_data <- purrr::pmap( + list( + "data" = all_data, + "id" = selection_ids, + "name" = purrr::map_chr(content$result$resources[res_index], ~ .x$name), + "created_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$created), + "modified_date" = purrr::map_chr(content$result$resources[res_index], ~ .x$last_modified) + ), + add_context + ) + } + + # Combine the list of resources into a single tibble + combined <- purrr::list_rbind(all_data) return(combined) } diff --git a/R/get_resource.R b/R/get_resource.R index 3b1b0cf..0631e18 100644 --- a/R/get_resource.R +++ b/R/get_resource.R @@ -3,10 +3,15 @@ #' @param res_id The resource ID as found on #' \href{https://www.opendata.nhs.scot/}{NHS Open Data platform} #' @param rows (optional) specify the max number of rows to return. -#' @param row_filters (optional) a named list or vector that specifies values of columns/fields to keep. +#' @param row_filters (optional) a named list or vector that specifies values of +#' columns/fields to keep. #' e.g. list(Date = 20220216, Sex = "Female"). -#' @param col_select (optional) a character vector containing the names of desired columns/fields. +#' @param col_select (optional) a character vector containing the names of +#' desired columns/fields. #' e.g. c("Date", "Sex"). +#' @param include_context (optional) If `TRUE` additional information about the +#' resource will be added as columns to the data, including the resource ID, the +#' resource name, the creation date and the last modified/updated date. #' #' @seealso [get_dataset()] for downloading all resources #' from a given dataset. @@ -17,11 +22,22 @@ #' #' @examples #' res_id <- "ca3f8e44-9a84-43d6-819c-a880b23bd278" +#' +#' data <- get_resource(res_id) +#' #' filters <- list("HB" = "S08000030", "Month" = "202109") #' wanted_cols <- c("HB", "Month", "TotalPatientsSeen") #' -#' df <- get_resource(res_id = res_id, row_filters = filters, col_select = wanted_cols) -get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = NULL) { +#' filtered_data <- get_resource( +#' res_id = res_id, +#' row_filters = filters, +#' col_select = wanted_cols +#' ) +get_resource <- function(res_id, + rows = NULL, + row_filters = NULL, + col_select = NULL, + include_context = FALSE) { # check res_id check_res_id(res_id) @@ -35,51 +51,71 @@ get_resource <- function(res_id, rows = NULL, row_filters = NULL, col_select = N # if dump should be used, use it if (use_dump_check(query, rows)) { - return(dump_download(res_id)) - } + data <- dump_download(res_id) + } else { + # if there is no row limit set + # set limit to CKAN max + if (is.null(query$limit)) query$limit <- 99999 - # if there is no row limit set - # set limit to CKAN max - if (is.null(query$limit)) query$limit <- 99999 + # remove null values from query + null_q_field <- sapply(query, is.null) + query[null_q_field] <- NULL - # remove null values from query - null_q_field <- sapply(query, is.null) - query[null_q_field] <- NULL + # fetch the data + res_content <- phs_GET("datastore_search", query) - # fetch the data - res_content <- phs_GET("datastore_search", query) - - # if the total number of rows is greater than the - # number of rows fetched - # AND the user was not aware of this limit (`rows` defaulted to NULL) - # warn the user about this limit. - total_rows <- res_content$result$total - if (is.null(rows) && query$limit < total_rows) { - cli::cli_warn(c( - "Returning the first {query$limit} + # if the total number of rows is greater than the + # number of rows fetched + # AND the user was not aware of this limit (`rows` defaulted to NULL) + # warn the user about this limit. + total_rows <- res_content$result$total + if (is.null(rows) && query$limit < total_rows) { + cli::cli_warn(c( + "Returning the first {query$limit} results (rows) of your query. {total_rows} rows match your query in total.", - i = "To get ALL matching rows you will need to download + i = "To get ALL matching rows you will need to download the whole resource and apply filters/selections locally." - )) - } + )) + } - # if more rows were requested than received - # let the user know - if (!is.null(rows) && query$limit > total_rows) { - cli::cli_alert_warning(c( - "You set {.var rows} to {query$limit} but - only {total_rows} rows matched your query." - )) + # if more rows were requested than received + # let the user know + if (!is.null(rows) && query$limit > total_rows) { + cli::cli_warn( + "You set {.var rows} to {.val {query$limit}} but only {.val {total_rows}} rows matched your query." + ) + } + + # extract data from response content + data <- purrr::map_dfr( + res_content$result$records, ~.x + ) %>% dplyr::select( + -dplyr::starts_with("rank "), + -dplyr::matches("_id") + ) } - # extract data from response content - data <- purrr::map_dfr( - res_content$result$records, ~.x - ) %>% dplyr::select( - -dplyr::starts_with("rank "), - -dplyr::matches("_id") - ) + if (include_context) { + # Get resource context if required + context_content <- phs_GET( + action = "resource_show", + query = paste0("id=", res_id) + ) + + res_id <- context_content$result$id + res_name <- context_content$result$name + res_created_date <- context_content$result$created + res_modified_date <- context_content$result$last_modified + + data <- data %>% + add_context( + id = res_id, + name = res_name, + created_date = res_created_date, + modified_date = res_modified_date + ) + } return(data) } diff --git a/R/request_url.R b/R/request_url.R index c789983..d6a19ea 100644 --- a/R/request_url.R +++ b/R/request_url.R @@ -6,11 +6,18 @@ #' request_url <- function(action, query) { # check action is valid - valid_actions <- c("datastore_search", "datastore_search_sql", "dump", "package_show", "package_list") + valid_actions <- c( + "datastore_search", + "datastore_search_sql", + "dump", + "package_show", + "package_list", + "resource_show" + ) if (!(action %in% valid_actions)) { cli::cli_abort(c( "API call failed.", - x = "Invalid {.var action} argument in request." + x = "{.val {action}} is an invalid {.arg action} argument." )) } diff --git a/man/get_dataset.Rd b/man/get_dataset.Rd index c12995a..855f102 100644 --- a/man/get_dataset.Rd +++ b/man/get_dataset.Rd @@ -4,7 +4,12 @@ \alias{get_dataset} \title{Get Open Data resources from a dataset} \usage{ -get_dataset(dataset_name, max_resources = NULL, rows = NULL) +get_dataset( + dataset_name, + max_resources = NULL, + rows = NULL, + include_context = FALSE +) } \arguments{ \item{dataset_name}{name of the dataset as found on @@ -16,6 +21,10 @@ it will return the n latest resources} \item{rows}{(optional) specify the max number of rows to return for each resource.} + +\item{include_context}{(optional) If \code{TRUE} additional information about the +resource will be added as columns to the data, including the resource ID, the +resource name, the creation date and the last modified/updated date.} } \value{ a \link[tibble:tibble-package]{tibble} with the data diff --git a/man/get_resource.Rd b/man/get_resource.Rd index 52118e1..8fa1fcf 100644 --- a/man/get_resource.Rd +++ b/man/get_resource.Rd @@ -4,7 +4,13 @@ \alias{get_resource} \title{Get Open Data resource} \usage{ -get_resource(res_id, rows = NULL, row_filters = NULL, col_select = NULL) +get_resource( + res_id, + rows = NULL, + row_filters = NULL, + col_select = NULL, + include_context = FALSE +) } \arguments{ \item{res_id}{The resource ID as found on @@ -12,11 +18,17 @@ get_resource(res_id, rows = NULL, row_filters = NULL, col_select = NULL) \item{rows}{(optional) specify the max number of rows to return.} -\item{row_filters}{(optional) a named list or vector that specifies values of columns/fields to keep. +\item{row_filters}{(optional) a named list or vector that specifies values of +columns/fields to keep. e.g. list(Date = 20220216, Sex = "Female").} -\item{col_select}{(optional) a character vector containing the names of desired columns/fields. +\item{col_select}{(optional) a character vector containing the names of +desired columns/fields. e.g. c("Date", "Sex").} + +\item{include_context}{(optional) If \code{TRUE} additional information about the +resource will be added as columns to the data, including the resource ID, the +resource name, the creation date and the last modified/updated date.} } \value{ a \link[tibble:tibble-package]{tibble} with the data @@ -26,10 +38,17 @@ Get Open Data resource } \examples{ res_id <- "ca3f8e44-9a84-43d6-819c-a880b23bd278" + +data <- get_resource(res_id) + filters <- list("HB" = "S08000030", "Month" = "202109") wanted_cols <- c("HB", "Month", "TotalPatientsSeen") -df <- get_resource(res_id = res_id, row_filters = filters, col_select = wanted_cols) +filtered_data <- get_resource( + res_id = res_id, + row_filters = filters, + col_select = wanted_cols +) } \seealso{ \code{\link[=get_dataset]{get_dataset()}} for downloading all resources diff --git a/phsopendata.Rproj b/phsopendata.Rproj index 69fafd4..e4e949a 100644 --- a/phsopendata.Rproj +++ b/phsopendata.Rproj @@ -20,3 +20,5 @@ BuildType: Package PackageUseDevtools: Yes PackageInstallArgs: --no-multiarch --with-keep.source PackageRoxygenize: rd,collate,namespace + +UseNativePipeOperator: No diff --git a/tests/testthat.R b/tests/testthat.R index d6ff61b..7a7ddb5 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,3 +1,11 @@ +# This file is part of the standard setup for testthat. +# It is recommended that you do not modify it. +# +# Where should you do additional test configuration? +# Learn more about the roles of various files in: +# * https://r-pkgs.org/testing-design.html#sec-tests-files-overview +# * https://testthat.r-lib.org/articles/special-files.html + library(testthat) library(phsopendata) diff --git a/tests/testthat/test-add_context.R b/tests/testthat/test-add_context.R new file mode 100644 index 0000000..c144ce0 --- /dev/null +++ b/tests/testthat/test-add_context.R @@ -0,0 +1,37 @@ +test_that("Returned context is the same for resource and dataset", { + dataset <- get_dataset( + "general-practitioner-contact-details", + rows = 10, + include_context = TRUE + ) + + res_id_1 <- "647b256e-4a03-4963-8402-bf559c9e2fff" + resource_1 <- get_resource( + res_id = res_id_1, + rows = 10, + include_context = TRUE + ) + + res_id_2 <- "e37c14fe-51f7-4935-87d1-c79b30fe8824" + resource_2 <- get_resource( + res_id = res_id_2, + rows = 10, + include_context = TRUE + ) + + expect_equal( + dataset %>% + dplyr::filter(ResID == res_id_1) %>% + dplyr::select(!dplyr::where(~ anyNA(.x))), + resource_1, + list_as_map = TRUE + ) + expect_equal( + dataset %>% + dplyr::filter(ResID == res_id_2) %>% + dplyr::select(!dplyr::where(~ anyNA(.x))), + resource_2, + list_as_map = TRUE + ) + # list_as_map = TRUE will sort variable names before comparing +}) diff --git a/tests/testthat/test-get_dataset.R b/tests/testthat/test-get_dataset.R index cab33b0..cbfb74f 100644 --- a/tests/testthat/test-get_dataset.R +++ b/tests/testthat/test-get_dataset.R @@ -1,13 +1,18 @@ +skip_if_offline(host = "www.opendata.nhs.scot") + test_that("returns data in the expected format", { + n_resources <- 2 + n_rows <- 2 data <- get_dataset( dataset_name = "gp-practice-populations", - max_resources = 2, - rows = 2 + max_resources = n_resources, + rows = n_rows ) expect_s3_class(data, "tbl_df") + expect_equal(nrow(data), n_resources * n_rows) expect_length(data, 24) - expect_equal(nrow(data), 2 * 2) + expect_named(data) }) test_that("errors properly", { diff --git a/tests/testthat/test-get_dataset_context.R b/tests/testthat/test-get_dataset_context.R new file mode 100644 index 0000000..065106a --- /dev/null +++ b/tests/testthat/test-get_dataset_context.R @@ -0,0 +1,33 @@ +skip_if_offline(host = "www.opendata.nhs.scot") + +test_that("returns expected context with the data", { + n_resources <- 2 + n_rows <- 2 + data_col_names <- names( + get_dataset( + dataset_name = "gp-practice-populations", + max_resources = 1, + rows = 1 + ) + ) + data <- get_dataset( + dataset_name = "gp-practice-populations", + max_resources = n_resources, + rows = n_rows, + include_context = TRUE + ) + + expect_s3_class(data, "tbl_df") + expect_type(data$ResID, "character") + expect_type(data$ResName, "character") + expect_s3_class(data$ResCreatedDate, "POSIXct") + expect_s3_class(data$ResModifiedDate, "POSIXct") + + expect_equal(nrow(data), n_resources * n_rows) + expect_length(data, 28) + expect_named( + data, + c("ResID", "ResName", "ResCreatedDate", "ResModifiedDate", data_col_names) + ) + expect_length(unique(data[["ResID"]]), n_resources) +}) diff --git a/tests/testthat/test-get_resource.R b/tests/testthat/test-get_resource.R index bdb44f6..128a0ce 100644 --- a/tests/testthat/test-get_resource.R +++ b/tests/testthat/test-get_resource.R @@ -1,3 +1,5 @@ +skip_if_offline(host = "www.opendata.nhs.scot") + test_that("returns data in the expected format", { gp_list_apr_2021 <- "a794d603-95ab-4309-8c92-b48970478c14" @@ -7,6 +9,7 @@ test_that("returns data in the expected format", { expect_s3_class(data, "tbl_df") expect_length(data, 15) expect_equal(nrow(data), 1) + expect_named(data) # with query data_q <- get_resource( @@ -15,63 +18,15 @@ test_that("returns data in the expected format", { col_select = c("PracticeCode", "AddressLine1") ) - expect_true(all(names(data_q) == c("PracticeCode", "AddressLine1"))) - expect_true(all(data_q$PracticeCode == 10002)) -}) - -test_that("checks res_id properly", { - # wrong type - expect_error( - get_resource(res_id = 123), - regexp = "(must be of type character)" - ) - # Invalid format (doesn't match regex) - expect_error( - get_resource("a794d603-95ab-4309-8c92-b48970478c1"), - regexp = "(is in an invalid format.)" - ) - # res_id is a vector of length > 1 - expect_error( - get_resource(1:5), - regexp = "(must be of length 1.)" - ) - # Correct format but not real - expect_error( - get_resource("00000000-0000-0000-0000-000000000000"), - regexp = "(Can't find resource with ID)" - ) -}) - -test_that("returns full data if only res_id is input", { - gp_list_apr_2021 <- "a794d603-95ab-4309-8c92-b48970478c14" - - data <- get_resource(res_id = gp_list_apr_2021) - - expect_equal(nrow(data), 926) + expect_named(data_q, c("PracticeCode", "AddressLine1")) + expect_equal(data_q[["PracticeCode"]], 10002) }) -test_that("returns full data if rows is set to over 99999", { +test_that("returns data with row specifications", { gp_list_apr_2021 <- "a794d603-95ab-4309-8c92-b48970478c14" - expect_warning( - data <- get_resource( - res_id = gp_list_apr_2021, - rows = 9999999, - row_filters = c("GPPracticeName" = "The Blue Practice") - ), - regexp = "Can't request over 99,999 rows" - ) - - expect_equal(nrow(data), 926) -}) - -test_that("first 99999 rows returned if query matches > 99999 rows", { - prescriptions_apr_2021 <- "51b7ad3f-6d52-4165-94f4-92e322656c85" - - expect_warning( - df <- get_resource(prescriptions_apr_2021, col_select = c("HBT")), - regexp = "(Returning the first 99999 results)" - ) + expect_equal(nrow(get_resource(res_id = gp_list_apr_2021, rows = 926)), 926) - expect_true(nrow(df) == 99999) + expect_equal(nrow(get_resource(res_id = gp_list_apr_2021, rows = 999)), 926) %>% + expect_warning() }) diff --git a/tests/testthat/test-get_resource_context.R b/tests/testthat/test-get_resource_context.R new file mode 100644 index 0000000..d6a97e7 --- /dev/null +++ b/tests/testthat/test-get_resource_context.R @@ -0,0 +1,47 @@ +skip_if_offline(host = "www.opendata.nhs.scot") + +test_that("returns expected context with the data", { + gp_list_apr_2021 <- "a794d603-95ab-4309-8c92-b48970478c14" + data_col_names <- names(get_resource(res_id = gp_list_apr_2021, rows = 1)) + + # without query + data <- get_resource( + res_id = gp_list_apr_2021, + rows = 10, + include_context = TRUE + ) + + expect_s3_class(data, "tbl_df") + expect_type(data$ResID, "character") + expect_type(data$ResName, "character") + expect_s3_class(data$ResCreatedDate, "POSIXct") + expect_s3_class(data$ResModifiedDate, "POSIXct") + + expect_length(data, 19) + expect_equal(nrow(data), 10) + expect_named(data, c( + "ResID", + "ResName", + "ResCreatedDate", + "ResModifiedDate", + data_col_names + )) + + # with query + data_q <- get_resource( + gp_list_apr_2021, + row_filters = list(PracticeCode = 10002), + col_select = c("PracticeCode", "AddressLine1"), + include_context = TRUE + ) + + expect_named(data_q, c( + "ResID", + "ResName", + "ResCreatedDate", + "ResModifiedDate", + "PracticeCode", + "AddressLine1" + )) + expect_equal(data_q[["PracticeCode"]], 10002) +}) diff --git a/tests/testthat/test-get_resource_dump.R b/tests/testthat/test-get_resource_dump.R new file mode 100644 index 0000000..624e5c3 --- /dev/null +++ b/tests/testthat/test-get_resource_dump.R @@ -0,0 +1,35 @@ +skip_if_offline(host = "www.opendata.nhs.scot") + +test_that("returns full data if only res_id is input", { + gp_list_apr_2021 <- "a794d603-95ab-4309-8c92-b48970478c14" + + data <- get_resource(res_id = gp_list_apr_2021) + + expect_equal(nrow(data), 926) +}) + +test_that("returns full data if rows is set to over 99999", { + gp_list_apr_2021 <- "a794d603-95ab-4309-8c92-b48970478c14" + + expect_warning( + data <- get_resource( + res_id = gp_list_apr_2021, + rows = 9999999, + row_filters = c("GPPracticeName" = "The Blue Practice") + ), + regexp = "Can't request over 99,999 rows" + ) + + expect_equal(nrow(data), 926) +}) + +test_that("first 99999 rows returned if query matches > 99999 rows", { + prescriptions_apr_2021 <- "51b7ad3f-6d52-4165-94f4-92e322656c85" + + expect_warning( + df <- get_resource(prescriptions_apr_2021, col_select = c("HBT")), + regexp = "(Returning the first 99999 results)" + ) + + expect_true(nrow(df) == 99999) +})