tidymodels · juliasilge · Mar 18, 2021 · Mar 10, 2021 · Mar 10, 2021 · Mar 10, 2021
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -42,5 +42,5 @@ License: MIT + file LICENSE
 Encoding: UTF-8
 VignetteBuilder: knitr
 LazyData: true
-RoxygenNote: 7.1.1.9000
+RoxygenNote: 7.1.1.9001
 Roxygen: list(markdown = TRUE)
diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,8 @@
 
 * Attempts to stratify on a `Surv` object now error more informatively (#230). 
 
+* Exposed `pool` argument from `make_strata()` in user-facing resampling functions (#229).
+
 
 # rsample 0.0.9
 

diff --git a/R/boot.R b/R/boot.R
@@ -14,15 +14,14 @@
 #'  package were the bootstrap samples are conducted *within the stratification
 #'  variable*. This can help ensure that the number of data points in the
 #'  bootstrap sample is equivalent to the proportions in the original data set.
-#'  (Strata below 10% of the total are pooled together.)
+#'  (Strata below 10% of the total are pooled together by default.)
 #' @inheritParams vfold_cv
+#' @inheritParams make_strata
 #' @param times The number of bootstrap samples.
 #' @param strata A variable that is used to conduct stratified sampling. When
 #'  not `NULL`, each bootstrap sample is created within the stratification
 #'  variable. This could be a single character value or a variable name that
 #'  corresponds to a variable that exists in the data frame.
-#' @param breaks A single number giving the number of bins desired to stratify
-#'  a numeric stratification variable.
 #' @param apparent A logical. Should an extra resample be added where the
 #'  analysis and holdout subset are the entire data set. This is required for
 #'  some estimators used by the `summary` function that require the apparent
@@ -48,15 +47,15 @@
 #'         })
 #'
 #' set.seed(13)
-#' resample2 <- bootstraps(wa_churn, strata = "churn", times = 3)
+#' resample2 <- bootstraps(wa_churn, strata = churn, times = 3)
 #' map_dbl(resample2$splits,
 #'         function(x) {
 #'           dat <- as.data.frame(x)$churn
 #'           mean(dat == "Yes")
 #'         })
 #'
 #' set.seed(13)
-#' resample3 <- bootstraps(wa_churn, strata = "tenure", breaks = 6, times = 3)
+#' resample3 <- bootstraps(wa_churn, strata = tenure, breaks = 6, times = 3)
 #' map_dbl(resample3$splits,
 #'         function(x) {
 #'           dat <- as.data.frame(x)$churn
@@ -68,6 +67,7 @@ bootstraps <-
            times = 25,
            strata = NULL,
            breaks = 4,
+           pool = 0.1,
            apparent = FALSE,
            ...) {
 
@@ -83,7 +83,8 @@ bootstraps <-
       data = data,
       times = times,
       strata = strata,
-      breaks = breaks
+      breaks = breaks,
+      pool = pool
     )
   if(apparent)
     split_objs <- bind_rows(split_objs, apparent(data))
@@ -108,7 +109,8 @@ boot_splits <-
   function(data,
            times = 25,
            strata = NULL,
-           breaks = 4) {
+           breaks = 4,
+           pool = 0.1) {
 
   n <- nrow(data)
 
@@ -117,7 +119,8 @@ boot_splits <-
   } else {
     stratas <- tibble::tibble(idx = 1:n,
                               strata = make_strata(getElement(data, strata),
-                                                   breaks = breaks))
+                                                   breaks = breaks,
+                                                   pool = pool))
     stratas <- split_unnamed(stratas, stratas$strata)
     stratas <-
       purrr::map_df(

diff --git a/R/initial_split.R b/R/initial_split.R
@@ -9,12 +9,11 @@
 #'  of data points in the training data is equivalent to the proportions in the
 #'  original data set. (Strata below 10% of the total are pooled together.)
 #' @inheritParams vfold_cv
+#' @inheritParams make_strata
 #' @param prop The proportion of data to be retained for modeling/analysis.
 #' @param strata A variable that is used to conduct stratified sampling to
 #'  create the resamples. This could be a single character value or a variable
 #'  name that corresponds to a variable that exists in the data frame.
-#' @param breaks A single number giving the number of bins desired to stratify
-#'  a numeric stratification variable.
 #' @export
 #' @return An `rsplit` object that can be used with the `training` and `testing`
 #'  functions to extract the data in each split.
@@ -38,7 +37,8 @@
 #'
 #' @export
 #'
-initial_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) {
+initial_split <- function(data, prop = 3/4,
+                          strata = NULL, breaks = 4, pool = 0.1, ...) {
 
   if (!missing(strata)) {
     strata <- tidyselect::vars_select(names(data), !!enquo(strata))
@@ -53,6 +53,7 @@ initial_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) {
       prop = prop,
       strata = strata,
       breaks = breaks,
+      pool = pool,
       times = 1,
       ...
     )

diff --git a/R/make_strata.R b/R/make_strata.R
@@ -25,7 +25,9 @@
 #' @param nunique An integer for the number of unique value threshold in the
 #'   algorithm.
 #' @param pool A proportion of data used to determine if a particular group is
-#'   too small and should be pooled into another group.
+#'   too small and should be pooled into another group. We do not recommend
+#'   decreasing this argument below its default of 0.1 because of the dangers
+#'   of stratifying groups that are too small.
 #' @param depth An integer that is used to determine the best number of
 #'   percentiles that should be used. The number of bins are based on
 #'   `min(5, floor(n / depth))` where `n = length(x)`.
@@ -67,6 +69,8 @@
 #' table(make_strata(x6, breaks = 10))
 #' @export
 make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) {
+
+  default_pool <- 0.1
   num_vals <- unique(x)
   n <- length(x)
   num_miss <- sum(is.na(x))
@@ -77,11 +81,18 @@ make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) {
 
     ## This should really be based on some combo of rate and number.
     if (all(pcts < pool)) {
-      warning("Too little data to stratify. Unstratified resampling ",
-              "will be used.",
-              call. = FALSE)
+      rlang::warn("Too little data to stratify. Unstratified resampling ",
+                  "will be used.")
       return(factor(rep("strata1", n)))
     }
+
+    if (pool < default_pool & any(pcts < default_pool))
+      rlang::warn(
+        paste0("Stratifying groups that make up ",
+               round(100 * pool), "% of the data may be ",
+               "statistically risky.\nConsider increasing `pool` to at least 0.1")
+      )
+
     ## Small groups will be randomly allocated to stratas at end
     ## These should probably go into adjacent groups but this works for now
     if (any(pcts < pool))
@@ -90,16 +101,16 @@ make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) {
     out <- factor(as.character(x))
   } else {
     if (floor(n / breaks) < depth) {
-      warning(paste0("The number of observations in each quantile is ",
-              "below the recommended threshold of ", depth, ". Stratification ",
-              "will be done with ", floor(n/depth), " breaks instead."),
-              call. = FALSE)
+      rlang::warn(
+        paste0("The number of observations in each quantile is ",
+               "below the recommended threshold of ", depth, ". Stratification ",
+               "will be done with ", floor(n/depth), " breaks instead.")
+      )
     }
     breaks <- min(breaks, floor(n/depth))
     if (breaks < 2) {
-      warning("Too little data to stratify. Unstratified resampling ",
-              "will be used.",
-              call. = FALSE)
+      rlang::warn("Too little data to stratify. Unstratified resampling ",
+                  "will be used.")
       return(factor(rep("strata1", n)))
     }
     pctls <- quantile(x, probs = (0:breaks) / breaks)

diff --git a/R/mc.R b/R/mc.R
@@ -6,15 +6,15 @@
 #' @details The `strata` argument causes the random sampling to be conducted
 #'  *within the stratification variable*. This can help ensure that the number of
 #'  data points in the analysis data is equivalent to the proportions in the
-#'  original data set. (Strata below 10% of the total are pooled together.)
+#'  original data set. (Strata below 10% of the total are pooled together
+#'  by default.)
 #' @inheritParams vfold_cv
+#' @inheritParams make_strata
 #' @param prop The proportion of data to be retained for modeling/analysis.
 #' @param times The number of times to repeat the sampling.
 #' @param strata A variable that is used to conduct stratified sampling to
 #'  create the resamples. This could be a single character value or a variable
 #'  name that corresponds to a variable that exists in the data frame.
-#' @param breaks A single number giving the number of bins desired to stratify
-#'  a numeric stratification variable.
 #' @export
 #' @return An tibble with classes `mc_cv`, `rset`, `tbl_df`, `tbl`, and
 #'  `data.frame`. The results include a column for the data split objects and a
@@ -35,22 +35,23 @@
 #'         })
 #'
 #' set.seed(13)
-#' resample2 <- mc_cv(wa_churn, strata = "churn", times = 3, prop = .5)
+#' resample2 <- mc_cv(wa_churn, strata = churn, times = 3, prop = .5)
 #' map_dbl(resample2$splits,
 #'         function(x) {
 #'           dat <- as.data.frame(x)$churn
 #'           mean(dat == "Yes")
 #'         })
 #'
 #' set.seed(13)
-#' resample3 <- mc_cv(wa_churn, strata = "tenure", breaks = 6, times = 3, prop = .5)
+#' resample3 <- mc_cv(wa_churn, strata = tenure, breaks = 6, times = 3, prop = .5)
 #' map_dbl(resample3$splits,
 #'         function(x) {
 #'           dat <- as.data.frame(x)$churn
 #'           mean(dat == "Yes")
 #'         })
 #' @export
-mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) {
+mc_cv <- function(data, prop = 3/4, times = 25,
+                  strata = NULL, breaks = 4, pool = 0.1, ...) {
 
   if(!missing(strata)) {
     strata <- tidyselect::vars_select(names(data), !!enquo(strata))
@@ -64,7 +65,8 @@ mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...)
               prop = prop,
               times = times,
               strata = strata,
-              breaks = breaks)
+              breaks = breaks,
+              pool = pool)
 
   ## We remove the holdout indices since it will save space and we can
   ## derive them later when they are needed.
@@ -88,7 +90,9 @@ mc_complement <- function(ind, n) {
 }
 
 
-mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) {
+mc_splits <- function(data, prop = 3/4, times = 25,
+                      strata = NULL, breaks = 4, pool = 0.1) {
+
   if (!is.numeric(prop) | prop >= 1 | prop <= 0)
     stop("`prop` must be a number on (0, 1).", call. = FALSE)
 
@@ -98,7 +102,8 @@ mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) {
   } else {
     stratas <- tibble::tibble(idx = 1:n,
                               strata = make_strata(getElement(data, strata),
-                                                   breaks = breaks))
+                                                   breaks = breaks,
+                                                   pool = pool))
     stratas <- split_unnamed(stratas, stratas$strata)
     stratas <-
       purrr::map_df(stratas, strat_sample, prop = prop, times = times)

diff --git a/R/validation_split.R b/R/validation_split.R
@@ -8,12 +8,11 @@
 #'  data points in the analysis data is equivalent to the proportions in the
 #'  original data set. (Strata below 10% of the total are pooled together.)
 #' @inheritParams vfold_cv
+#' @inheritParams make_strata
 #' @param prop The proportion of data to be retained for modeling/analysis.
 #' @param strata A variable that is used to conduct stratified sampling to
 #'  create the resamples. This could be a single character value or a variable
 #'  name that corresponds to a variable that exists in the data frame.
-#' @param breaks A single number giving the number of bins desired to stratify
-#'  a numeric stratification variable.
 #' @export
 #' @return An tibble with classes `validation_split`, `rset`, `tbl_df`, `tbl`,
 #'  and `data.frame`. The results include a column for the data split objects
@@ -22,7 +21,8 @@
 #' @examples
 #' validation_split(mtcars, prop = .9)
 #' @export
-validation_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) {
+validation_split <- function(data, prop = 3/4,
+                             strata = NULL, breaks = 4, pool = 0.1, ...) {
 
   if (!missing(strata)) {
     strata <- tidyselect::vars_select(names(data), !!enquo(strata))
@@ -38,7 +38,8 @@ validation_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) {
               prop = prop,
               times = 1,
               strata = strata,
-              breaks = breaks)
+              breaks = breaks,
+              pool = pool)
 
   ## We remove the holdout indices since it will save space and we can
   ## derive them later when they are needed.

diff --git a/R/vfold.R b/R/vfold.R
@@ -9,19 +9,18 @@
 #' The `strata` argument causes the random sampling to be conducted *within
 #'  the stratification variable*. This can help ensure that the number of data
 #'  points in the analysis data is equivalent to the proportions in the original
-#'  data set. (Strata below 10% of the total are pooled together.)
+#'  data set. (Strata below 10% of the total are pooled together by default.)
 #' When more than one repeat is requested, the basic V-fold cross-validation
 #'  is conducted each time. For example, if three repeats are used with `v =
 #'  10`, there are a total of 30 splits which as three groups of 10 that are
 #'  generated separately.
+#' @inheritParams make_strata
 #' @param data A data frame.
 #' @param v The number of partitions of the data set.
 #' @param repeats The number of times to repeat the V-fold partitioning.
 #' @param strata A variable that is used to conduct stratified sampling to
 #'  create the folds. This could be a single character value or a variable name
 #'  that corresponds to a variable that exists in the data frame.
-#' @param breaks A single number giving the number of bins desired to stratify
-#'  a numeric stratification variable.
 #' @param ... Not currently used.
 #' @export
 #' @return A tibble with classes `vfold_cv`, `rset`, `tbl_df`, `tbl`, and
@@ -47,22 +46,23 @@
 #'         })
 #'
 #' set.seed(13)
-#' folds2 <- vfold_cv(wa_churn, strata = "churn", v = 5)
+#' folds2 <- vfold_cv(wa_churn, strata = churn, v = 5)
 #' map_dbl(folds2$splits,
 #'         function(x) {
 #'           dat <- as.data.frame(x)$churn
 #'           mean(dat == "Yes")
 #'         })
 #'
 #' set.seed(13)
-#' folds3 <- vfold_cv(wa_churn, strata = "tenure", breaks = 6, v = 5)
+#' folds3 <- vfold_cv(wa_churn, strata = tenure, breaks = 6, v = 5)
 #' map_dbl(folds3$splits,
 #'         function(x) {
 #'           dat <- as.data.frame(x)$churn
 #'           mean(dat == "Yes")
 #'         })
 #' @export
-vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) {
+vfold_cv <- function(data, v = 10, repeats = 1,
+                     strata = NULL, breaks = 4, pool = 0.1, ...) {
 
   if(!missing(strata)) {
     strata <- tidyselect::vars_select(names(data), !!enquo(strata))
@@ -72,10 +72,11 @@ vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...)
   strata_check(strata, data)
 
   if (repeats == 1) {
-    split_objs <- vfold_splits(data = data, v = v, strata = strata, breaks = breaks)
+    split_objs <- vfold_splits(data = data, v = v,
+                               strata = strata, breaks = breaks, pool = pool)
   } else {
     for (i in 1:repeats) {
-      tmp <- vfold_splits(data = data, v = v, strata = strata)
+      tmp <- vfold_splits(data = data, v = v, strata = strata, pool = pool)
       tmp$id2 <- tmp$id
       tmp$id <- names0(repeats, "Repeat")[i]
       split_objs <- if (i == 1)
@@ -101,7 +102,7 @@ vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...)
 }
 
 
-vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4) {
+vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4, pool = 0.1) {
   if (!is.numeric(v) || length(v) != 1)
     stop("`v` must be a single integer.", call. = FALSE)
 
@@ -113,7 +114,8 @@ vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4) {
   } else {
     stratas <- tibble::tibble(idx = 1:n,
                               strata = make_strata(getElement(data, strata),
-                                                   breaks = breaks))
+                                                   breaks = breaks,
+                                                   pool = pool))
     stratas <- split_unnamed(stratas, stratas$strata)
     stratas <- purrr::map(stratas, add_vfolds, v = v)
     stratas <- dplyr::bind_rows(stratas)