Skip to content

Commit

Permalink
Merge pull request #47 from bsweger/bsweger/skip_data_checks_option
Browse files Browse the repository at this point in the history
  • Loading branch information
annakrystalli authored Jul 26, 2024
2 parents 1b9e98f + 762a001 commit 4f79c69
Show file tree
Hide file tree
Showing 13 changed files with 760 additions and 28 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,4 @@
^data-raw$
^vignettes/articles$
^attic$
^\.vscode$
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.Rproj.user
.Rprofile
.DS_Store
.Rhistory
.Rdata
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: hubData
Title: Tools for accessing and working with hubverse data
Version: 1.1.1
Version: 1.2.0
Authors@R:
c(person("Anna", "Krystalli", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-2378-4915")),
Expand Down
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# hubData 1.2.0

* Adds a `skip_checks` parameter to the `connect_hub` and `connect_model_output` functions. When `skip_checks` is set to `TRUE`, these functions will bypass the default behavior of scanning the hub's model output directory for invalid files. Omitting these checks results in better performance when connecting to cloud-based hubs but can result in errors when querying the data. This option is only valid when connecting to hubs that meet the following criteria:
- the model output directory contains only model output data (no `README.md`, for example)
- the model output files use a single file format.

# hubData 1.1.1

* Fix {tidyselect} warnings by converting internal syntax
Expand Down
61 changes: 46 additions & 15 deletions R/connect_hub.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@
#' `admin.json` and is ignored by default.
#' If supplied, it will override hub configuration setting. Multiple formats can
#' be supplied to `connect_hub` but only a single file format can be supplied to `connect_model_output`.
#' @param skip_checks Logical. If `FALSE` (default), check the `file_format` parameter against the
#' hub's model output files. Also excludes invalid model output files when opening hub datasets.
#' Setting to `TRUE` will improve performance but will result in an error if the model output
#' directory includes invalid files. Cannot be `TRUE` when there are multiple file formats in
#' the hub's model output directory or when the hub's model output directory contains files that
#' are not model output data (for example, a README).
#' @inheritParams create_hub_schema
#'
#' @return
Expand Down Expand Up @@ -75,7 +81,8 @@ connect_hub <- function(hub_path,
"double", "integer",
"logical", "Date"
),
partitions = list(model_id = arrow::utf8())) {
partitions = list(model_id = arrow::utf8()),
skip_checks = FALSE) {
UseMethod("connect_hub")
}

Expand All @@ -88,7 +95,8 @@ connect_hub.default <- function(hub_path,
"double", "integer",
"logical", "Date"
),
partitions = list(model_id = arrow::utf8())) {
partitions = list(model_id = arrow::utf8()),
skip_checks = FALSE) {
rlang::check_required(hub_path)
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)

Expand All @@ -112,8 +120,15 @@ connect_hub.default <- function(hub_path,
}
hub_name <- config_admin$name

# Only keep file formats of which files actually exist in model_output_dir.
file_format <- check_file_format(model_output_dir, file_format)
file_format <- check_file_format(model_output_dir, file_format, skip_checks)

# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files <- TRUE
}

if (length(file_format) == 0L) {
dataset <- list()
Expand All @@ -123,7 +138,8 @@ connect_hub.default <- function(hub_path,
file_format = file_format,
config_tasks = config_tasks,
output_type_id_datatype = output_type_id_datatype,
partitions = partitions
partitions = partitions,
exclude_invalid_files = exclude_invalid_files
)
}
if (inherits(dataset, "UnionDataset")) {
Expand All @@ -145,6 +161,7 @@ connect_hub.default <- function(hub_path,
class = c("hub_connection", class(dataset)),
hub_name = hub_name,
file_format = file_format,
checks = exclude_invalid_files,
file_system = file_system,
hub_path = hub_path,
model_output_dir = model_output_dir,
Expand All @@ -165,7 +182,8 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
"logical",
"Date"
),
partitions = list(model_id = arrow::utf8())) {
partitions = list(model_id = arrow::utf8()),
skip_checks = FALSE) {
rlang::check_required(hub_path)
output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)

Expand All @@ -187,8 +205,15 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
}
hub_name <- config_admin$name

# Only keep file formats of which files actually exist in model_output_dir.
file_format <- check_file_format(model_output_dir, file_format)
file_format <- check_file_format(model_output_dir, file_format, skip_checks)

# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files <- TRUE
}

if (length(file_format) == 0L) {
dataset <- list()
Expand All @@ -198,7 +223,8 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
file_format = file_format,
config_tasks = config_tasks,
output_type_id_datatype = output_type_id_datatype,
partitions = partitions
partitions = partitions,
exclude_invalid_files = exclude_invalid_files
)
}

Expand All @@ -221,6 +247,7 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
class = c("hub_connection", class(dataset)),
hub_name = hub_name,
file_format = file_format,
checks = exclude_invalid_files,
file_system = file_system,
hub_path = hub_path$base_path,
model_output_dir = model_output_dir$base_path,
Expand All @@ -238,7 +265,8 @@ open_hub_dataset <- function(model_output_dir,
"double", "integer",
"logical", "Date"
),
partitions = list(model_id = arrow::utf8())) {
partitions = list(model_id = arrow::utf8()),
exclude_invalid_files) {
file_format <- rlang::arg_match(file_format)
schema <- create_hub_schema(config_tasks,
partitions = partitions,
Expand All @@ -253,23 +281,23 @@ open_hub_dataset <- function(model_output_dir,
col_types = schema,
unify_schemas = FALSE,
strings_can_be_null = TRUE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
),
parquet = arrow::open_dataset(
model_output_dir,
format = "parquet",
partitioning = "model_id",
schema = schema,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
),
arrow = arrow::open_dataset(
model_output_dir,
format = "arrow",
partitioning = "model_id",
schema = schema,
unify_schemas = FALSE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
)
}
Expand All @@ -284,14 +312,16 @@ open_hub_datasets <- function(model_output_dir,
"logical", "Date"
),
partitions = list(model_id = arrow::utf8()),
exclude_invalid_files,
call = rlang::caller_env()) {
if (length(file_format) == 1L) {
open_hub_dataset(
model_output_dir = model_output_dir,
file_format = file_format,
config_tasks = config_tasks,
output_type_id_datatype,
partitions = partitions
partitions = partitions,
exclude_invalid_files
)
} else {
cons <- purrr::map(
Expand All @@ -301,7 +331,8 @@ open_hub_datasets <- function(model_output_dir,
file_format = .x,
config_tasks = config_tasks,
output_type_id_datatype = output_type_id_datatype,
partitions = partitions
partitions = partitions,
exclude_invalid_files
)
)

Expand Down
41 changes: 32 additions & 9 deletions R/connect_model_output.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,33 @@
connect_model_output <- function(model_output_dir,
file_format = c("csv", "parquet", "arrow"),
partition_names = "model_id",
schema = NULL) {
schema = NULL,
skip_checks = FALSE) {
UseMethod("connect_model_output")
}

#' @export
connect_model_output.default <- function(model_output_dir,
file_format = c("csv", "parquet", "arrow"),
partition_names = "model_id",
schema = NULL) {
schema = NULL,
skip_checks = FALSE) {
rlang::check_required(model_output_dir)
if (!dir.exists(model_output_dir)) {
cli::cli_abort(c("x" = "Directory {.path {model_output_dir}} does not exist."))
}

file_format <- rlang::arg_match(file_format)
# Only keep file formats of which files actually exist in model_output_dir.
file_format <- check_file_format(model_output_dir, file_format, error = TRUE)
file_format <- check_file_format(model_output_dir, file_format, skip_checks, error = TRUE)

# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files <- TRUE
}

if (file_format == "csv") {
dataset <- arrow::open_dataset(
Expand All @@ -35,7 +46,7 @@ connect_model_output.default <- function(model_output_dir,
col_types = schema,
unify_schemas = TRUE,
strings_can_be_null = TRUE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
} else {
dataset <- arrow::open_dataset(
Expand All @@ -44,7 +55,7 @@ connect_model_output.default <- function(model_output_dir,
partitioning = partition_names,
schema = schema,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
}

Expand All @@ -55,6 +66,7 @@ connect_model_output.default <- function(model_output_dir,
structure(dataset,
class = c("mod_out_connection", class(dataset)),
file_format = file_format,
checks = exclude_invalid_files,
file_system = class(dataset$filesystem)[1],
model_output_dir = model_output_dir
)
Expand All @@ -64,11 +76,21 @@ connect_model_output.default <- function(model_output_dir,
connect_model_output.SubTreeFileSystem <- function(model_output_dir,
file_format = c("csv", "parquet", "arrow"),
partition_names = "model_id",
schema = NULL) {
schema = NULL,
skip_checks = FALSE) {
rlang::check_required(model_output_dir)

file_format <- rlang::arg_match(file_format)
# Only keep file formats of which files actually exist in model_output_dir.
file_format <- check_file_format(model_output_dir, file_format, error = TRUE)
file_format <- check_file_format(model_output_dir, file_format, skip_checks, error = TRUE)

# Based on skip_checks param, set a flag that determines whether or not to
# check for invalid files when opening model output data.
if (isTRUE(skip_checks)) {
exclude_invalid_files <- FALSE
} else {
exclude_invalid_files <- TRUE
}

if (file_format == "csv") {
dataset <- arrow::open_dataset(
Expand All @@ -78,7 +100,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
schema = schema,
unify_schemas = TRUE,
strings_can_be_null = TRUE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
} else {
dataset <- arrow::open_dataset(
Expand All @@ -87,7 +109,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
partitioning = partition_names,
schema = schema,
unify_schemas = TRUE,
factory_options = list(exclude_invalid_files = TRUE)
factory_options = list(exclude_invalid_files = exclude_invalid_files)
)
}

Expand All @@ -99,6 +121,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
structure(dataset,
class = c("mod_out_connection", class(dataset)),
file_format = file_format,
checks = exclude_invalid_files,
file_system = class(dataset$filesystem$base_fs)[1],
model_output_dir = model_output_dir$base_path
)
Expand Down
10 changes: 10 additions & 0 deletions R/print.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,11 @@ print.hub_connection <- function(x, verbose = FALSE, ...) {
"*" = "file_format: {.val {print_file_format_meta(x)}}"
)
}
if (!is.null(attr(x, "checks"))) {
print_msg <- c(print_msg,
"*" = "checks: {.val {attr(x, 'checks')}}"
)
}
if (!is.null(attr(x, "file_system"))) {
print_msg <- c(print_msg,
"*" = "file_system: {.val {attr(x, 'file_system')[1]}}"
Expand Down Expand Up @@ -79,6 +84,11 @@ print.mod_out_connection <- function(x, verbose = FALSE, ...) {
"*" = "file_format: {.val {print_file_format_meta(x)}}"
)
}
if (!is.null(attr(x, "checks"))) {
print_msg <- c(print_msg,
"*" = "checks: {.val {attr(x, 'checks')}}"
)
}
if (!is.null(attr(x, "file_system"))) {
print_msg <- c(print_msg,
"*" = "file_system: {.val {attr(x, 'file_system')}}"
Expand Down
9 changes: 8 additions & 1 deletion R/utils-connect_hub.R
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ get_file_format_meta <- function(dataset, model_output_dir, file_format) {
rbind(n_open, n_in_dir)
}

check_file_format <- function(model_output_dir, file_format,
check_file_format <- function(model_output_dir, file_format, skip_checks,
call = rlang::caller_env(), error = FALSE) {
dir_file_formats <- get_dir_file_formats(model_output_dir)
valid_file_format <- file_format[file_format %in% dir_file_formats]
Expand All @@ -121,6 +121,13 @@ check_file_format <- function(model_output_dir, file_format,
call = call
)
}
if (length(dir_file_formats) > 1L && isTRUE(skip_checks)) {
cli::cli_abort("Skip_checks cannot be TRUE when there
are multiple file formats in the model output directory
({.val {dir_file_formats}}).",
call = call
)
}
valid_file_format
}

Expand Down
11 changes: 9 additions & 2 deletions man/connect_hub.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 4f79c69

Please sign in to comment.