Merge pull request #47 from bsweger/bsweger/skip_data_checks_option

hubverse-org · Jul 26, 2024 · 4f79c69 · 4f79c69
2 parents 1b9e98f + 762a001
commit 4f79c69
Show file tree

Hide file tree

Showing 13 changed files with 760 additions and 28 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -17,3 +17,4 @@
 ^data-raw$
 ^vignettes/articles$
 ^attic$
+^\.vscode$
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 .Rproj.user
+.Rprofile
 .DS_Store
 .Rhistory
 .Rdata

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: hubData
 Title: Tools for accessing and working with hubverse data
-Version: 1.1.1
+Version: 1.2.0
 Authors@R: 
     c(person("Anna", "Krystalli", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-2378-4915")),

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# hubData 1.2.0
+
+* Adds a `skip_checks` parameter to the `connect_hub` and `connect_model_output` functions. When `skip_checks` is set to `TRUE`, these functions will bypass the default behavior of scanning the hub's model output directory for invalid files. Omitting these checks results in better performance when connecting to cloud-based hubs but can result in errors when querying the data. This option is only valid when connecting to hubs that meet the following criteria:
+    - the model output directory contains only model output data (no `README.md`, for example)
+    - the model output files use a single file format.
+
 # hubData 1.1.1
 
 * Fix {tidyselect} warnings by converting internal syntax

diff --git a/R/connect_hub.R b/R/connect_hub.R
@@ -25,6 +25,12 @@
 #' `admin.json` and is ignored by default.
 #' If supplied, it will override hub configuration setting. Multiple formats can
 #' be supplied to `connect_hub` but only a single file format can be supplied to `connect_mod_out`.
+#' @param skip_checks Logical. If `FALSE` (default), check file_format parameter against the
+#' hub's model output files. Also excludes invalid model output files when opening hub datasets.
+#' Setting to `TRUE` will improve performance but will result in an error if the model output
+#' directory includes invalid files. Cannot be `TRUE` when there are multiple file formats in
+#' the hub's model output directory or when the hub's model output directory contains files that
+#' are not model output data (for example, a README).
 #' @inheritParams create_hub_schema
 #'
 #' @return
@@ -75,7 +81,8 @@ connect_hub <- function(hub_path,
                           "double", "integer",
                           "logical", "Date"
                         ),
-                        partitions = list(model_id = arrow::utf8())) {
+                        partitions = list(model_id = arrow::utf8()),
+                        skip_checks = FALSE) {
   UseMethod("connect_hub")
 }
 
@@ -88,7 +95,8 @@ connect_hub.default <- function(hub_path,
                                   "double", "integer",
                                   "logical", "Date"
                                 ),
-                                partitions = list(model_id = arrow::utf8())) {
+                                partitions = list(model_id = arrow::utf8()),
+                                skip_checks = FALSE) {
   rlang::check_required(hub_path)
   output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
 
@@ -112,8 +120,15 @@ connect_hub.default <- function(hub_path,
   }
   hub_name <- config_admin$name
 
-  # Only keep file formats of which files actually exist in model_output_dir.
-  file_format <- check_file_format(model_output_dir, file_format)
+  file_format <- check_file_format(model_output_dir, file_format, skip_checks)
+
+  # Based on skip_checks param, set a flag that determines whether or not to
+  # check for invalid files when opening model output data.
+  if (isTRUE(skip_checks)) {
+    exclude_invalid_files <- FALSE
+  } else {
+    exclude_invalid_files <- TRUE
+  }
 
   if (length(file_format) == 0L) {
     dataset <- list()
@@ -123,7 +138,8 @@ connect_hub.default <- function(hub_path,
       file_format = file_format,
       config_tasks = config_tasks,
       output_type_id_datatype = output_type_id_datatype,
-      partitions = partitions
+      partitions = partitions,
+      exclude_invalid_files = exclude_invalid_files
     )
   }
   if (inherits(dataset, "UnionDataset")) {
@@ -145,6 +161,7 @@ connect_hub.default <- function(hub_path,
     class = c("hub_connection", class(dataset)),
     hub_name = hub_name,
     file_format = file_format,
+    checks = exclude_invalid_files,
     file_system = file_system,
     hub_path = hub_path,
     model_output_dir = model_output_dir,
@@ -165,7 +182,8 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
                                             "logical",
                                             "Date"
                                           ),
-                                          partitions = list(model_id = arrow::utf8())) {
+                                          partitions = list(model_id = arrow::utf8()),
+                                          skip_checks = FALSE) {
   rlang::check_required(hub_path)
   output_type_id_datatype <- rlang::arg_match(output_type_id_datatype)
 
@@ -187,8 +205,15 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
   }
   hub_name <- config_admin$name
 
-  # Only keep file formats of which files actually exist in model_output_dir.
-  file_format <- check_file_format(model_output_dir, file_format)
+  file_format <- check_file_format(model_output_dir, file_format, skip_checks)
+
+  # Based on skip_checks param, set a flag that determines whether or not to
+  # check for invalid files when opening model output data.
+  if (isTRUE(skip_checks)) {
+    exclude_invalid_files <- FALSE
+  } else {
+    exclude_invalid_files <- TRUE
+  }
 
   if (length(file_format) == 0L) {
     dataset <- list()
@@ -198,7 +223,8 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
       file_format = file_format,
       config_tasks = config_tasks,
       output_type_id_datatype = output_type_id_datatype,
-      partitions = partitions
+      partitions = partitions,
+      exclude_invalid_files = exclude_invalid_files
     )
   }
 
@@ -221,6 +247,7 @@ connect_hub.SubTreeFileSystem <- function(hub_path,
     class = c("hub_connection", class(dataset)),
     hub_name = hub_name,
     file_format = file_format,
+    checks = exclude_invalid_files,
     file_system = file_system,
     hub_path = hub_path$base_path,
     model_output_dir = model_output_dir$base_path,
@@ -238,7 +265,8 @@ open_hub_dataset <- function(model_output_dir,
                                "double", "integer",
                                "logical", "Date"
                              ),
-                             partitions = list(model_id = arrow::utf8())) {
+                             partitions = list(model_id = arrow::utf8()),
+                             exclude_invalid_files) {
   file_format <- rlang::arg_match(file_format)
   schema <- create_hub_schema(config_tasks,
     partitions = partitions,
@@ -253,23 +281,23 @@ open_hub_dataset <- function(model_output_dir,
       col_types = schema,
       unify_schemas = FALSE,
       strings_can_be_null = TRUE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     ),
     parquet = arrow::open_dataset(
       model_output_dir,
       format = "parquet",
       partitioning = "model_id",
       schema = schema,
       unify_schemas = FALSE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     ),
     arrow = arrow::open_dataset(
       model_output_dir,
       format = "arrow",
       partitioning = "model_id",
       schema = schema,
       unify_schemas = FALSE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     )
   )
 }
@@ -284,14 +312,16 @@ open_hub_datasets <- function(model_output_dir,
                                 "logical", "Date"
                               ),
                               partitions = list(model_id = arrow::utf8()),
+                              exclude_invalid_files,
                               call = rlang::caller_env()) {
   if (length(file_format) == 1L) {
     open_hub_dataset(
       model_output_dir = model_output_dir,
       file_format = file_format,
       config_tasks = config_tasks,
       output_type_id_datatype,
-      partitions = partitions
+      partitions = partitions,
+      exclude_invalid_files
     )
   } else {
     cons <- purrr::map(
@@ -301,7 +331,8 @@ open_hub_datasets <- function(model_output_dir,
         file_format = .x,
         config_tasks = config_tasks,
         output_type_id_datatype = output_type_id_datatype,
-        partitions = partitions
+        partitions = partitions,
+        exclude_invalid_files
       )
     )
 

diff --git a/R/connect_model_output.R b/R/connect_model_output.R
@@ -10,22 +10,33 @@
 connect_model_output <- function(model_output_dir,
                                  file_format = c("csv", "parquet", "arrow"),
                                  partition_names = "model_id",
-                                 schema = NULL) {
+                                 schema = NULL,
+                                 skip_checks = FALSE) {
   UseMethod("connect_model_output")
 }
 
 #' @export
 connect_model_output.default <- function(model_output_dir,
                                          file_format = c("csv", "parquet", "arrow"),
                                          partition_names = "model_id",
-                                         schema = NULL) {
+                                         schema = NULL,
+                                         skip_checks = FALSE) {
   rlang::check_required(model_output_dir)
   if (!dir.exists(model_output_dir)) {
     cli::cli_abort(c("x" = "Directory {.path {model_output_dir}} does not exist."))
   }
+
   file_format <- rlang::arg_match(file_format)
   # Only keep file formats of which files actually exist in model_output_dir.
-  file_format <- check_file_format(model_output_dir, file_format, error = TRUE)
+  file_format <- check_file_format(model_output_dir, file_format, skip_checks, error = TRUE)
+
+  # Based on skip_checks param, set a flag that determines whether or not to
+  # check for invalid files when opening model output data.
+  if (isTRUE(skip_checks)) {
+    exclude_invalid_files <- FALSE
+  } else {
+    exclude_invalid_files <- TRUE
+  }
 
   if (file_format == "csv") {
     dataset <- arrow::open_dataset(
@@ -35,7 +46,7 @@ connect_model_output.default <- function(model_output_dir,
       col_types = schema,
       unify_schemas = TRUE,
       strings_can_be_null = TRUE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     )
   } else {
     dataset <- arrow::open_dataset(
@@ -44,7 +55,7 @@ connect_model_output.default <- function(model_output_dir,
       partitioning = partition_names,
       schema = schema,
       unify_schemas = TRUE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     )
   }
 
@@ -55,6 +66,7 @@ connect_model_output.default <- function(model_output_dir,
   structure(dataset,
     class = c("mod_out_connection", class(dataset)),
     file_format = file_format,
+    checks = exclude_invalid_files,
     file_system = class(dataset$filesystem)[1],
     model_output_dir = model_output_dir
   )
@@ -64,11 +76,21 @@ connect_model_output.default <- function(model_output_dir,
 connect_model_output.SubTreeFileSystem <- function(model_output_dir,
                                                    file_format = c("csv", "parquet", "arrow"),
                                                    partition_names = "model_id",
-                                                   schema = NULL) {
+                                                   schema = NULL,
+                                                   skip_checks = FALSE) {
   rlang::check_required(model_output_dir)
+
   file_format <- rlang::arg_match(file_format)
   # Only keep file formats of which files actually exist in model_output_dir.
-  file_format <- check_file_format(model_output_dir, file_format, error = TRUE)
+  file_format <- check_file_format(model_output_dir, file_format, skip_checks, error = TRUE)
+
+  # Based on skip_checks param, set a flag that determines whether or not to
+  # check for invalid files when opening model output data.
+  if (isTRUE(skip_checks)) {
+    exclude_invalid_files <- FALSE
+  } else {
+    exclude_invalid_files <- TRUE
+  }
 
   if (file_format == "csv") {
     dataset <- arrow::open_dataset(
@@ -78,7 +100,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
       schema = schema,
       unify_schemas = TRUE,
       strings_can_be_null = TRUE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     )
   } else {
     dataset <- arrow::open_dataset(
@@ -87,7 +109,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
       partitioning = partition_names,
       schema = schema,
       unify_schemas = TRUE,
-      factory_options = list(exclude_invalid_files = TRUE)
+      factory_options = list(exclude_invalid_files = exclude_invalid_files)
     )
   }
 
@@ -99,6 +121,7 @@ connect_model_output.SubTreeFileSystem <- function(model_output_dir,
   structure(dataset,
     class = c("mod_out_connection", class(dataset)),
     file_format = file_format,
+    checks = exclude_invalid_files,
     file_system = class(dataset$filesystem$base_fs)[1],
     model_output_dir = model_output_dir$base_path
   )

diff --git a/R/print.R b/R/print.R
@@ -34,6 +34,11 @@ print.hub_connection <- function(x, verbose = FALSE, ...) {
       "*" = "file_format: {.val {print_file_format_meta(x)}}"
     )
   }
+  if (!is.null(attr(x, "checks"))) {
+    print_msg <- c(print_msg,
+      "*" = "checks: {.val {attr(x, 'checks')}}"
+    )
+  }
   if (!is.null(attr(x, "file_system"))) {
     print_msg <- c(print_msg,
       "*" = "file_system: {.val {attr(x, 'file_system')[1]}}"
@@ -79,6 +84,11 @@ print.mod_out_connection <- function(x, verbose = FALSE, ...) {
       "*" = "file_format: {.val {print_file_format_meta(x)}}"
     )
   }
+  if (!is.null(attr(x, "checks"))) {
+    print_msg <- c(print_msg,
+      "*" = "checks: {.val {attr(x, 'checks')}}"
+    )
+  }
   if (!is.null(attr(x, "file_system"))) {
     print_msg <- c(print_msg,
       "*" = "file_system: {.val {attr(x, 'file_system')}}"

diff --git a/R/utils-connect_hub.R b/R/utils-connect_hub.R
@@ -102,7 +102,7 @@ get_file_format_meta <- function(dataset, model_output_dir, file_format) {
   rbind(n_open, n_in_dir)
 }
 
-check_file_format <- function(model_output_dir, file_format,
+check_file_format <- function(model_output_dir, file_format, skip_checks,
                               call = rlang::caller_env(), error = FALSE) {
   dir_file_formats <- get_dir_file_formats(model_output_dir)
   valid_file_format <- file_format[file_format %in% dir_file_formats]
@@ -121,6 +121,13 @@ check_file_format <- function(model_output_dir, file_format,
       call = call
     )
   }
+  if (length(dir_file_formats) > 1L && isTRUE(skip_checks)) {
+    cli::cli_abort("Skip_checks cannot be TRUE when there
+                   are multiple file formats in the model output directory
+                   ({.val {dir_file_formats}}).",
+      call = call
+    )
+  }
   valid_file_format
 }
 

diff --git a/man/connect_hub.Rd b/man/connect_hub.Rd