insightsengineering · edelarua · Jun 8, 2023 · May 16, 2023 · May 16, 2023 · May 17, 2023
diff --git a/NEWS.md b/NEWS.md
@@ -25,6 +25,8 @@
 * Consolidated all KM plot documentation within the `g_km` function.
 * Added `a_count_patients_sum_exposure` for `summarize_patients_exposure_in_cols` and new analyze function `analyze_patients_exposure_in_cols`.
 * Added more informative error when the user selects an invalid method for unstratified analyses in `s_proportion_diff`.
+* Updated `s_summary` and `s_compare` to allow `NA` values in input variables. For factor variables with `NA`s, if `na.rm = FALSE` an explicit `NA` level will be automatically added. With `na.rm = TRUE`, `"<Missing>"` values are also treated as missing and excluded.
+* Updated the purpose of the `na_level` parameter in `s_summary` and `s_compare` to align with other `tern` functions: previously it was the string to treat as `NA` when setting `na.rm = TRUE`; now it is the string printed in place of `NA` values in the output table.
 
 ### Miscellaneous
 * Implemented the `lubridate` package for date variables in `tern` datasets.

diff --git a/R/argument_convention.R b/R/argument_convention.R
@@ -34,7 +34,7 @@
 #'   for more information.
 #' @param lyt (`layout`)\cr input layout where analyses will be added to.
 #' @param na.rm (`flag`)\cr whether `NA` values should be removed from `x` prior to analysis.
-#' @param na_level (`string`)\cr used to replace all `NA` or empty values in factors with custom `string`.
+#' @param na_level (`string`)\cr string used to replace all `NA` or empty values in the output.
 #' @param nested (`flag`)\cr whether this layout instruction be applied within the existing layout structure _if
 #'   possible_ (`TRUE`, the default) or as a new top-level element (`FALSE`). Ignored if it would nest a split
 #'   underneath analyses, which is not allowed.

diff --git a/R/compare_variables.R b/R/compare_variables.R
@@ -11,13 +11,15 @@
 #'   between columns, therefore a row-based proportion would not make sense. Proportion based on `N_col` would
 #'   be difficult since we use counts for the chi-squared test statistic, therefore missing values should be accounted
 #'   for as explicit factor levels.
+#' * If factor variables contain `NA`, these `NA` values are excluded by default. To include `NA` values
+#'   set `na.rm = FALSE` and missing values will be displayed as an `NA` level. Alternatively, an explicit
+#'   factor level can be defined for `NA` values during pre-processing via [df_explicit_na()] - the
+#'   default `na_level` (`"<Missing>"`) will also be excluded when `na.rm` is set to `TRUE`.
 #' * For character variables, automatic conversion to factor does not guarantee that the table
 #'   will be generated correctly. In particular for sparse tables this very likely can fail.
 #'   Therefore it is always better to manually convert character variables to factors during pre-processing.
 #' * For `compare_vars()`, the column split must define a reference group via `ref_group` so that the comparison
 #'   is well defined.
-#' * When factor variables contains `NA`, it is expected that `NA` values have been conveyed to `na_level`
-#'   appropriately beforehand via [df_explicit_na()].
 #'
 #' @seealso Relevant constructor function [create_afun_compare()], and [s_summary()] which is used internally
 #'   to compute a summary within `s_compare()`.
@@ -104,24 +106,25 @@ s_compare.factor <- function(x,
                              .in_ref_col,
                              denom = "n",
                              na.rm = TRUE, # nolint
-                             na_level = "<Missing>",
                              ...) {
   checkmate::assert_flag(.in_ref_col)
-  assert_valid_factor(x, any.missing = FALSE)
-  assert_valid_factor(.ref_group, any.missing = FALSE)
+  assert_valid_factor(x)
+  assert_valid_factor(.ref_group)
   denom <- match.arg(denom)
 
   y <- s_summary.factor(
     x = x,
     denom = denom,
     na.rm = na.rm,
-    na_level = na_level,
     ...
   )
 
   if (na.rm) {
-    x <- fct_discard(x, na_level)
-    .ref_group <- fct_discard(.ref_group, na_level)
+    x <- x[!is.na(x)] %>% fct_discard("<Missing>")
+    .ref_group <- .ref_group[!is.na(.ref_group)] %>% fct_discard("<Missing>")
+  } else {
+    x <- x %>% explicit_na(label = "NA")
+    .ref_group <- .ref_group %>% explicit_na(label = "NA")
   }
 
   checkmate::assert_factor(x, levels = levels(.ref_group), min.levels = 2)
@@ -171,19 +174,17 @@ s_compare.character <- function(x,
                                 .in_ref_col,
                                 denom = "n",
                                 na.rm = TRUE, # nolint
-                                na_level = "<Missing>",
                                 .var,
                                 verbose = TRUE,
                                 ...) {
-  x <- as_factor_keep_attributes(x, x_name = .var, na_level = na_level, verbose = verbose)
-  .ref_group <- as_factor_keep_attributes(.ref_group, x_name = .var, na_level = na_level, verbose = verbose)
+  x <- as_factor_keep_attributes(x, x_name = .var, verbose = verbose)
+  .ref_group <- as_factor_keep_attributes(.ref_group, x_name = .var, verbose = verbose)
   s_compare(
     x = x,
     .ref_group = .ref_group,
     .in_ref_col = .in_ref_col,
     denom = denom,
     na.rm = na.rm,
-    na_level = na_level,
     ...
   )
 }
@@ -511,6 +512,7 @@ compare_vars <- function(lyt,
                          var_labels = vars,
                          nested = TRUE,
                          ...,
+                         na_level = NA_character_,
                          show_labels = "default",
                          table_names = vars,
                          .stats = c("n", "mean_sd", "count_fraction", "pval"),
@@ -526,6 +528,7 @@ compare_vars <- function(lyt,
     afun = afun,
     nested = nested,
     extra_args = list(...),
+    na_str = na_level,
     inclNAs = TRUE,
     show_labels = show_labels,
     table_names = table_names

diff --git a/R/summarize_variables.R b/R/summarize_variables.R
@@ -125,7 +125,6 @@ s_summary <- function(x,
                       denom,
                       .N_row, # nolint
                       .N_col, # nolint
-                      na_level,
                       .var,
                       ...) {
   checkmate::assert_flag(na.rm)
@@ -216,7 +215,6 @@ s_summary.numeric <- function(x,
                               denom,
                               .N_row, # nolint
                               .N_col, # nolint
-                              na_level,
                               .var,
                               control = control_summarize_vars(),
                               ...) {
@@ -315,8 +313,10 @@ s_summary.numeric <- function(x,
 #' @note
 #' * If `x` is an empty `factor`, a list is still returned for `counts` with one element
 #'   per factor level. If there are no levels in `x`, the function fails.
-#' * If `x` contains `NA`, it is expected that `NA` have been conveyed to `na_level`
-#'   appropriately beforehand with [df_explicit_na()] or [explicit_na()].
+#' * If factor variables contain `NA`, these `NA` values are excluded by default. To include `NA` values
+#'   set `na.rm = FALSE` and missing values will be displayed as an `NA` level. Alternatively, an explicit
+#'   factor level can be defined for `NA` values during pre-processing via [df_explicit_na()] - the
+#'   default `na_level` (`"<Missing>"`) will also be excluded when `na.rm` is set to `TRUE`.
 #'
 #' @method s_summary factor
 #'
@@ -345,12 +345,15 @@ s_summary.factor <- function(x,
                              denom = c("n", "N_row", "N_col"),
                              .N_row, # nolint
                              .N_col, # nolint
-                             na_level = "<Missing>",
                              ...) {
-  assert_valid_factor(x, any.missing = FALSE)
+  assert_valid_factor(x)
   denom <- match.arg(denom)
 
-  if (na.rm) x <- fct_discard(x, na_level)
+  if (na.rm) {
+    x <- x[!is.na(x)] %>% fct_discard("<Missing>")
+  } else {
+    x <- x %>% explicit_na(label = "NA")
+  }
 
   y <- list()
 
@@ -401,15 +404,13 @@ s_summary.character <- function(x,
                                 denom = c("n", "N_row", "N_col"),
                                 .N_row, # nolint
                                 .N_col, # nolint
-                                na_level = "<Missing>",
                                 .var,
                                 verbose = TRUE,
                                 ...) {
-  y <- as_factor_keep_attributes(x, x_name = .var, na_level = na_level, verbose = verbose)
+  y <- as_factor_keep_attributes(x, x_name = .var, verbose = verbose)
   s_summary(
     x = y,
     na.rm = na.rm,
-    na_level = na_level,
     denom = denom,
     .N_row = .N_row,
     .N_col = .N_col,
@@ -731,6 +732,7 @@ summarize_vars <- function(lyt,
                            var_labels = vars,
                            nested = TRUE,
                            ...,
+                           na_level = NA_character_,
                            show_labels = "default",
                            table_names = vars,
                            section_div = NA_character_,
@@ -747,6 +749,7 @@ summarize_vars <- function(lyt,
     afun = afun,
     nested = nested,
     extra_args = list(...),
+    na_str = na_level,
     inclNAs = TRUE,
     show_labels = show_labels,
     table_names = table_names,

diff --git a/man/analyze_vars_in_cols.Rd b/man/analyze_vars_in_cols.Rd
diff --git a/man/argument_convention.Rd b/man/argument_convention.Rd
diff --git a/man/compare_variables.Rd b/man/compare_variables.Rd
diff --git a/man/h_map_for_count_abnormal.Rd b/man/h_map_for_count_abnormal.Rd
diff --git a/man/h_stack_by_baskets.Rd b/man/h_stack_by_baskets.Rd