Merge 289ff87 into 2dba87e

insightsengineering · Nov 10, 2022 · 0e02c7a · 0e02c7a
2 parents 2dba87e + 289ff87
commit 0e02c7a
Show file tree

Hide file tree

Showing 7 changed files with 131 additions and 31 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -13,6 +13,7 @@
 
 * Updated examples to use `scda.2022`.
 * Added R session information into a link in the footer of `teal` applications.
+* Added data hashing step using `rlang` instead of `digest` package to calculate the hash (which has been moved from `teal.data` and `teal.slice`). There is now an explicit hashing check in the reproducible code output 
 
 # teal 0.12.0
 

diff --git a/R/get_rcode.R b/R/get_rcode.R
@@ -91,7 +91,9 @@ get_rcode <- function(datasets = NULL,
     )
     str_install <- paste(c(get_rcode_str_install(), ""), collapse = "\n")
     str_libs <- paste(get_rcode_libraries(), "\n")
-    str_code <- get_datasets_code(datanames, datasets)
+
+    hashes <- calculate_hashes(datanames, datasets)
+    str_code <- c(get_datasets_code(datanames, datasets, hashes), teal.slice::get_filter_expr(datasets, datanames))
   } else {
     str_header <- get_rcode_header(title = title, description = description)
     str_install <- character(0)
@@ -141,30 +143,21 @@ get_rcode <- function(datasets = NULL,
 #'  - hash of loaded objects
 #'  - filter panel code
 #' @keywords internal
-get_datasets_code <- function(datanames, datasets) {
+get_datasets_code <- function(datanames, datasets, hashes) {
   str_code <- datasets$get_code(datanames)
   if (length(str_code) == 0 || (length(str_code) == 1 && str_code == "")) {
-    str_code <- paste0(c(
-      "#################################################################",
-      "# ___  ____ ____ ___  ____ ____ ____ ____ ____ ____ _ _  _ ____ #",
-      "# |__] |__/ |___ |__] |__/ |  | |    |___ [__  [__  | |\\ | | __ #",
-      "# |    |  \\ |___ |    |  \\ |__| |___ |___ ___] ___] | | \\| |__] #",
-      "#              _ ____    ____ _  _ ___  ___ _   _               #",
-      "#              | [__     |___ |\\/| |__]  |   \\_/                #",
-      "#              | ___]    |___ |  | |     |    |                 #",
-      "#################################################################\n"
-    ), collapse = "\n")
+    str_code <- "message('Preprocessing is empty')"
   } else if (length(str_code) > 0) {
     str_code <- paste0(str_code, "\n\n")
   }
   if (!datasets$get_check()) {
     check_note_string <- paste0(
       c(
-        "## NOTE: Reproducibility of data import and preprocessing was not",
-        "## explicitly checked (argument \"check = FALSE\" is set).",
-        "## The app developer has the choice to check the reproducibility",
-        "## and might have omitted this step for some reason. Please reach",
-        "## out to the app developer for details.\n"
+        "warning(paste(\"Reproducibility of data import and preprocessing was not\",",
+        "   \"explicitly checked (argument 'check = FALSE' is set).\",",
+        "   \"The app developer has the choice to check the reproducibility\",",
+        "   \"and might have omitted this step for some reason. Please reach\",",
+        "   \"out to the app developer for details.\n\"))"
       ),
       collapse = "\n"
     )
@@ -177,9 +170,9 @@ get_datasets_code <- function(datanames, datasets) {
         datanames,
         function(dataname) {
           sprintf(
-            "# %s MD5 hash at the time of analysis: %s",
-            dataname,
-            datasets$get_filtered_dataset(dataname)$get_hash()
+            "stopifnot(%s == %s)",
+            deparse1(bquote(rlang::hash(.(as.name(dataname))))),
+            deparse1(hashes[[dataname]])
           )
         },
         character(1)
@@ -189,11 +182,7 @@ get_datasets_code <- function(datanames, datasets) {
     "\n\n"
   )
 
-  str_filter <- teal.slice::get_filter_expr(datasets, datanames)
-  if (str_filter != "") {
-    str_filter <- paste0(str_filter, "\n\n")
-  }
-  c(str_code, str_hash, str_filter)
+  c(str_code, str_hash)
 }
 
 ## Module ----

diff --git a/R/module_nested_tabs.R b/R/module_nested_tabs.R
@@ -160,6 +160,7 @@ srv_nested_tabs.teal_modules <- function(id, datasets, modules, reporter) {
         "module { deparse1(modules$label) }."
       )
     )
+
     modules_reactive <- sapply(names(modules$children), USE.NAMES = TRUE, function(id) {
       srv_nested_tabs(id = id, datasets = datasets, modules = modules$children[[id]], reporter = reporter)
     })
@@ -255,13 +256,32 @@ srv_nested_tabs.teal_module <- function(id, datasets, modules, reporter) {
     }
   )
 
+  hashes <- calculate_hashes(datanames, datasets)
   metadata <- lapply(datanames, datasets$get_metadata)
   names(metadata) <- datanames
 
   new_tdata(
     data,
-    reactive(get_datasets_code(datanames, datasets)),
+    reactive(c(get_datasets_code(datanames, datasets, hashes), teal.slice::get_filter_expr(datasets, datanames))),
     datasets$get_join_keys(),
     metadata
   )
 }
+
+#' Get the hash of a dataset
+#'
+#' @param datanames (`character`) names of datasets
+#' @param datasets (`FilteredData`) object holding the data
+#'
+#' @return A list of hashes per dataset
+#' @keywords internal
+#'
+calculate_hashes <- function(datanames, datasets) {
+  sapply(
+    datanames,
+    simplify = FALSE,
+    function(x) {
+      rlang::hash(datasets$get_data(x, filtered = FALSE))
+    }
+  )
+}
diff --git a/man/calculate_hashes.Rd b/man/calculate_hashes.Rd
diff --git a/man/get_datasets_code.Rd b/man/get_datasets_code.Rd
diff --git a/tests/testthat/test-get_rcode.R b/tests/testthat/test-get_rcode.R
@@ -80,15 +80,16 @@ testthat::test_that("get_datasets_code returns code only for specified datanames
     )
   )
 
+  hashes <- calculate_hashes(datasets$datanames(), datasets)
   testthat::expect_true(
     !grepl(
       "mtcars",
-      paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
+      paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
       ignore.case = TRUE
     ) &&
       grepl(
         "iris",
-        paste(get_datasets_code(datasets = datasets, dataname = "IRIS"), collapse = "\n"),
+        paste(get_datasets_code(datasets = datasets, dataname = "IRIS", hashes = hashes), collapse = "\n"),
         ignore.case = TRUE
       )
   )

diff --git a/tests/testthat/test-module_nested_tabs.R b/tests/testthat/test-module_nested_tabs.R
@@ -313,8 +313,15 @@ testthat::test_that(".datasets_to_data returns tdata object", {
 
   # code
   testthat::expect_equal(
-    isolate(get_code(data)[1]),
-    "d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n"
+    isolate(get_code(data)),
+    c(
+      "d1 <- data.frame(id = 1:5, pk = c(2, 3, 2, 1, 4), val = 1:5)\nd2 <- data.frame(id = 1:5, value = 1:5)\n\n",
+      paste0(
+        "stopifnot(rlang::hash(d1) == \"f6f90d2c133ca4abdeb2f7a7d85b731e\")\n",
+        "stopifnot(rlang::hash(d2) == \"6e30be195b7d914a1311672c3ebf4e4f\") \n\n"
+      ),
+      ""
+    )
   )
 
   # metadata
@@ -343,3 +350,65 @@ testthat::test_that(".datasets_to_data returns parent datasets for CDISC data",
   data <- .datasets_to_data(module, datasets)
   testthat::expect_setequal(isolate(names(data)), c("ADSL", "ADAE"))
 })
+
+testthat::test_that("calculate_hashes takes a FilteredData and vector of datanames as input", {
+  adsl <- data.frame(STUDYID = 1, USUBJID = 1)
+  adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
+  adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)
+
+  datasets <- teal.slice::init_filtered_data(
+    teal.data::cdisc_data(
+      teal.data::cdisc_dataset("ADSL", adsl),
+      teal.data::cdisc_dataset("ADAE", adae),
+      teal.data::cdisc_dataset("ADTTE", adtte)
+    )
+  )
+
+  testthat::expect_error(calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets), NA)
+})
+
+testthat::test_that("calculate_hashes returns a named list", {
+  adsl <- data.frame(STUDYID = 1, USUBJID = 1)
+  adae <- data.frame(STUDYID = 1, USUBJID = 1, ASTDTM = 1, AETERM = 1, AESEQ = 1)
+  adtte <- data.frame(STUDYID = 1, USUBJID = 1, PARAMCD = 1)
+
+  datasets <- teal.slice::init_filtered_data(
+    teal.data::cdisc_data(
+      teal.data::cdisc_dataset("ADSL", adsl),
+      teal.data::cdisc_dataset("ADAE", adae),
+      teal.data::cdisc_dataset("ADTTE", adtte)
+    )
+  )
+
+  hashes <- calculate_hashes(datanames = c("ADSL", "ADAE", "ADTTE"), datasets = datasets)
+  testthat::expect_identical(
+    hashes,
+    list(
+      "ADSL" = "e89f5271357822c78dd5cfddb60c0a95",
+      "ADAE" = "f71b576ecfd23075f7285841327515e0",
+      "ADTTE" = "c68c01c86b946a3dfe05150da040aa2a"
+    )
+  )
+  testthat::expect_is(hashes, "list")
+  testthat::expect_named(hashes)
+})
+
+testthat::test_that("calculate_hashes returns the hash of the non Filtered dataset", {
+  datasets <- teal.slice::init_filtered_data(
+    teal.data::teal_data(
+      teal.data::dataset("iris", iris)
+    )
+  )
+
+  fs <- list(
+    iris = list(
+      Sepal.Length = list(c(5.1, 6.4)),
+      Species = c("setosa", "versicolor")
+    )
+  )
+  datasets$set_filter_state(state = fs)
+
+  hashes <- calculate_hashes(datanames = c("iris"), datasets = datasets)
+  testthat::expect_identical(hashes, list("iris" = "34844aba7bde36f5a34f6d8e39803508"))
+  testthat::expect_false(hashes == rlang::hash(isolate(datasets$get_data("iris", filtered = TRUE))))
+})