diff --git a/DESCRIPTION b/DESCRIPTION index 64b73a50..424930db 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,5 +42,5 @@ License: MIT + file LICENSE Encoding: UTF-8 VignetteBuilder: knitr LazyData: true -RoxygenNote: 7.1.1.9000 +RoxygenNote: 7.1.1.9001 Roxygen: list(markdown = TRUE) diff --git a/NEWS.md b/NEWS.md index c635622b..2c0f891b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -8,6 +8,8 @@ * Attempts to stratify on a `Surv` object now error more informatively (#230). +* Exposed `pool` argument from `make_strata()` in user-facing resampling functions (#229). + # rsample 0.0.9 diff --git a/R/boot.R b/R/boot.R index 5ca742dc..d2481481 100644 --- a/R/boot.R +++ b/R/boot.R @@ -14,15 +14,14 @@ #' package were the bootstrap samples are conducted *within the stratification #' variable*. This can help ensure that the number of data points in the #' bootstrap sample is equivalent to the proportions in the original data set. -#' (Strata below 10% of the total are pooled together.) +#' (Strata below 10% of the total are pooled together by default.) #' @inheritParams vfold_cv +#' @inheritParams make_strata #' @param times The number of bootstrap samples. #' @param strata A variable that is used to conduct stratified sampling. When #' not `NULL`, each bootstrap sample is created within the stratification #' variable. This could be a single character value or a variable name that #' corresponds to a variable that exists in the data frame. -#' @param breaks A single number giving the number of bins desired to stratify -#' a numeric stratification variable. #' @param apparent A logical. Should an extra resample be added where the #' analysis and holdout subset are the entire data set. 
This is required for #' some estimators used by the `summary` function that require the apparent @@ -48,7 +47,7 @@ #' }) #' #' set.seed(13) -#' resample2 <- bootstraps(wa_churn, strata = "churn", times = 3) +#' resample2 <- bootstraps(wa_churn, strata = churn, times = 3) #' map_dbl(resample2$splits, #' function(x) { #' dat <- as.data.frame(x)$churn @@ -56,7 +55,7 @@ #' }) #' #' set.seed(13) -#' resample3 <- bootstraps(wa_churn, strata = "tenure", breaks = 6, times = 3) +#' resample3 <- bootstraps(wa_churn, strata = tenure, breaks = 6, times = 3) #' map_dbl(resample3$splits, #' function(x) { #' dat <- as.data.frame(x)$churn @@ -68,6 +67,7 @@ bootstraps <- times = 25, strata = NULL, breaks = 4, + pool = 0.1, apparent = FALSE, ...) { @@ -83,7 +83,8 @@ bootstraps <- data = data, times = times, strata = strata, - breaks = breaks + breaks = breaks, + pool = pool ) if(apparent) split_objs <- bind_rows(split_objs, apparent(data)) @@ -108,7 +109,8 @@ boot_splits <- function(data, times = 25, strata = NULL, - breaks = 4) { + breaks = 4, + pool = 0.1) { n <- nrow(data) @@ -117,7 +119,8 @@ boot_splits <- } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), - breaks = breaks)) + breaks = breaks, + pool = pool)) stratas <- split_unnamed(stratas, stratas$strata) stratas <- purrr::map_df( diff --git a/R/initial_split.R b/R/initial_split.R index 499aba26..a1afcb86 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -9,12 +9,11 @@ #' of data points in the training data is equivalent to the proportions in the #' original data set. (Strata below 10% of the total are pooled together.) #' @inheritParams vfold_cv +#' @inheritParams make_strata #' @param prop The proportion of data to be retained for modeling/analysis. #' @param strata A variable that is used to conduct stratified sampling to #' create the resamples. This could be a single character value or a variable #' name that corresponds to a variable that exists in the data frame. 
-#' @param breaks A single number giving the number of bins desired to stratify -#' a numeric stratification variable. #' @export #' @return An `rsplit` object that can be used with the `training` and `testing` #' functions to extract the data in each split. @@ -38,7 +37,8 @@ #' #' @export #' -initial_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) { +initial_split <- function(data, prop = 3/4, + strata = NULL, breaks = 4, pool = 0.1, ...) { if (!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) @@ -53,6 +53,7 @@ initial_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) { prop = prop, strata = strata, breaks = breaks, + pool = pool, times = 1, ... ) diff --git a/R/make_strata.R b/R/make_strata.R index f10c2450..79339e31 100644 --- a/R/make_strata.R +++ b/R/make_strata.R @@ -25,7 +25,9 @@ #' @param nunique An integer for the number of unique value threshold in the #' algorithm. #' @param pool A proportion of data used to determine if a particular group is -#' too small and should be pooled into another group. +#' too small and should be pooled into another group. We do not recommend +#' decreasing this argument below its default of 0.1 because of the dangers +#' of stratifying groups that are too small. #' @param depth An integer that is used to determine the best number of #' percentiles that should be used. The number of bins are based on #' `min(5, floor(n / depth))` where `n = length(x)`. @@ -67,6 +69,8 @@ #' table(make_strata(x6, breaks = 10)) #' @export make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) { + + default_pool <- 0.1 num_vals <- unique(x) n <- length(x) num_miss <- sum(is.na(x)) @@ -77,11 +81,18 @@ make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) { ## This should really be based on some combo of rate and number. if (all(pcts < pool)) { - warning("Too little data to stratify. Unstratified resampling ", - "will be used.", - call. 
= FALSE) + rlang::warn(paste0("Too little data to stratify. Unstratified resampling ", + "will be used.")) return(factor(rep("strata1", n))) } + + if (pool < default_pool && any(pcts < default_pool)) + rlang::warn( + paste0("Stratifying groups that make up ", + round(100 * pool), "% of the data may be ", + "statistically risky.\nConsider increasing `pool` to at least 0.1") + ) + ## Small groups will be randomly allocated to stratas at end ## These should probably go into adjacent groups but this works for now if (any(pcts < pool)) @@ -90,16 +101,16 @@ make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) { out <- factor(as.character(x)) } else { if (floor(n / breaks) < depth) { - warning(paste0("The number of observations in each quantile is ", - "below the recommended threshold of ", depth, ". Stratification ", - "will be done with ", floor(n/depth), " breaks instead."), - call. = FALSE) + rlang::warn( + paste0("The number of observations in each quantile is ", + "below the recommended threshold of ", depth, ". Stratification ", + "will be done with ", floor(n/depth), " breaks instead.") + ) } breaks <- min(breaks, floor(n/depth)) if (breaks < 2) { - warning("Too little data to stratify. Unstratified resampling ", - "will be used.", - call. = FALSE) + rlang::warn(paste0("Too little data to stratify. Unstratified resampling ", + "will be used.")) return(factor(rep("strata1", n))) } pctls <- quantile(x, probs = (0:breaks) / breaks) diff --git a/R/mc.R b/R/mc.R index e1b74ae8..0c3a2f8f 100644 --- a/R/mc.R +++ b/R/mc.R @@ -6,15 +6,15 @@ #' @details The `strata` argument causes the random sampling to be conducted #' *within the stratification variable*. This can help ensure that the number of #' data points in the analysis data is equivalent to the proportions in the -#' original data set. (Strata below 10% of the total are pooled together.) +#' original data set. (Strata below 10% of the total are pooled together +#' by default.)
#' @inheritParams vfold_cv +#' @inheritParams make_strata #' @param prop The proportion of data to be retained for modeling/analysis. #' @param times The number of times to repeat the sampling. #' @param strata A variable that is used to conduct stratified sampling to #' create the resamples. This could be a single character value or a variable #' name that corresponds to a variable that exists in the data frame. -#' @param breaks A single number giving the number of bins desired to stratify -#' a numeric stratification variable. #' @export #' @return An tibble with classes `mc_cv`, `rset`, `tbl_df`, `tbl`, and #' `data.frame`. The results include a column for the data split objects and a @@ -35,7 +35,7 @@ #' }) #' #' set.seed(13) -#' resample2 <- mc_cv(wa_churn, strata = "churn", times = 3, prop = .5) +#' resample2 <- mc_cv(wa_churn, strata = churn, times = 3, prop = .5) #' map_dbl(resample2$splits, #' function(x) { #' dat <- as.data.frame(x)$churn @@ -43,14 +43,15 @@ #' }) #' #' set.seed(13) -#' resample3 <- mc_cv(wa_churn, strata = "tenure", breaks = 6, times = 3, prop = .5) +#' resample3 <- mc_cv(wa_churn, strata = tenure, breaks = 6, times = 3, prop = .5) #' map_dbl(resample3$splits, #' function(x) { #' dat <- as.data.frame(x)$churn #' mean(dat == "Yes") #' }) #' @export -mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) { +mc_cv <- function(data, prop = 3/4, times = 25, + strata = NULL, breaks = 4, pool = 0.1, ...) { if(!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) @@ -64,7 +65,8 @@ mc_cv <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) prop = prop, times = times, strata = strata, - breaks = breaks) + breaks = breaks, + pool = pool) ## We remove the holdout indices since it will save space and we can ## derive them later when they are needed. 
@@ -88,7 +90,9 @@ mc_complement <- function(ind, n) { } -mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) { +mc_splits <- function(data, prop = 3/4, times = 25, + strata = NULL, breaks = 4, pool = 0.1) { + if (!is.numeric(prop) | prop >= 1 | prop <= 0) stop("`prop` must be a number on (0, 1).", call. = FALSE) @@ -98,7 +102,8 @@ mc_splits <- function(data, prop = 3/4, times = 25, strata = NULL, breaks = 4) { } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), - breaks = breaks)) + breaks = breaks, + pool = pool)) stratas <- split_unnamed(stratas, stratas$strata) stratas <- purrr::map_df(stratas, strat_sample, prop = prop, times = times) diff --git a/R/validation_split.R b/R/validation_split.R index 2ad13046..b43f2692 100644 --- a/R/validation_split.R +++ b/R/validation_split.R @@ -8,12 +8,11 @@ #' data points in the analysis data is equivalent to the proportions in the #' original data set. (Strata below 10% of the total are pooled together.) #' @inheritParams vfold_cv +#' @inheritParams make_strata #' @param prop The proportion of data to be retained for modeling/analysis. #' @param strata A variable that is used to conduct stratified sampling to #' create the resamples. This could be a single character value or a variable #' name that corresponds to a variable that exists in the data frame. -#' @param breaks A single number giving the number of bins desired to stratify -#' a numeric stratification variable. #' @export #' @return An tibble with classes `validation_split`, `rset`, `tbl_df`, `tbl`, #' and `data.frame`. The results include a column for the data split objects @@ -22,7 +21,8 @@ #' @examples #' validation_split(mtcars, prop = .9) #' @export -validation_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) { +validation_split <- function(data, prop = 3/4, + strata = NULL, breaks = 4, pool = 0.1, ...) 
{ if (!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) @@ -38,7 +38,8 @@ validation_split <- function(data, prop = 3/4, strata = NULL, breaks = 4, ...) { prop = prop, times = 1, strata = strata, - breaks = breaks) + breaks = breaks, + pool = pool) ## We remove the holdout indices since it will save space and we can ## derive them later when they are needed. diff --git a/R/vfold.R b/R/vfold.R index db38dcc5..a82965fb 100644 --- a/R/vfold.R +++ b/R/vfold.R @@ -9,19 +9,18 @@ #' The `strata` argument causes the random sampling to be conducted *within #' the stratification variable*. This can help ensure that the number of data #' points in the analysis data is equivalent to the proportions in the original -#' data set. (Strata below 10% of the total are pooled together.) +#' data set. (Strata below 10% of the total are pooled together by default.) #' When more than one repeat is requested, the basic V-fold cross-validation #' is conducted each time. For example, if three repeats are used with `v = #' 10`, there are a total of 30 splits which as three groups of 10 that are #' generated separately. +#' @inheritParams make_strata #' @param data A data frame. #' @param v The number of partitions of the data set. #' @param repeats The number of times to repeat the V-fold partitioning. #' @param strata A variable that is used to conduct stratified sampling to #' create the folds. This could be a single character value or a variable name #' that corresponds to a variable that exists in the data frame. -#' @param breaks A single number giving the number of bins desired to stratify -#' a numeric stratification variable. #' @param ... Not currently used. 
#' @export #' @return A tibble with classes `vfold_cv`, `rset`, `tbl_df`, `tbl`, and @@ -47,7 +46,7 @@ #' }) #' #' set.seed(13) -#' folds2 <- vfold_cv(wa_churn, strata = "churn", v = 5) +#' folds2 <- vfold_cv(wa_churn, strata = churn, v = 5) #' map_dbl(folds2$splits, #' function(x) { #' dat <- as.data.frame(x)$churn @@ -55,14 +54,15 @@ #' }) #' #' set.seed(13) -#' folds3 <- vfold_cv(wa_churn, strata = "tenure", breaks = 6, v = 5) +#' folds3 <- vfold_cv(wa_churn, strata = tenure, breaks = 6, v = 5) #' map_dbl(folds3$splits, #' function(x) { #' dat <- as.data.frame(x)$churn #' mean(dat == "Yes") #' }) #' @export -vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) { +vfold_cv <- function(data, v = 10, repeats = 1, + strata = NULL, breaks = 4, pool = 0.1, ...) { if(!missing(strata)) { strata <- tidyselect::vars_select(names(data), !!enquo(strata)) @@ -72,10 +72,11 @@ vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) strata_check(strata, data) if (repeats == 1) { - split_objs <- vfold_splits(data = data, v = v, strata = strata, breaks = breaks) + split_objs <- vfold_splits(data = data, v = v, + strata = strata, breaks = breaks, pool = pool) } else { for (i in 1:repeats) { - tmp <- vfold_splits(data = data, v = v, strata = strata) + tmp <- vfold_splits(data = data, v = v, strata = strata, breaks = breaks, pool = pool) tmp$id2 <- tmp$id tmp$id <- names0(repeats, "Repeat")[i] split_objs <- if (i == 1) @@ -101,7 +102,7 @@ vfold_cv <- function(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) } -vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4) { +vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4, pool = 0.1) { if (!is.numeric(v) || length(v) != 1) stop("`v` must be a single integer.", call.
= FALSE) @@ -113,7 +114,8 @@ vfold_splits <- function(data, v = 10, strata = NULL, breaks = 4) { } else { stratas <- tibble::tibble(idx = 1:n, strata = make_strata(getElement(data, strata), - breaks = breaks)) + breaks = breaks, + pool = pool)) stratas <- split_unnamed(stratas, stratas$strata) stratas <- purrr::map(stratas, add_vfolds, v = v) stratas <- dplyr::bind_rows(stratas) diff --git a/man/bootstraps.Rd b/man/bootstraps.Rd index da05a4ba..3a199c70 100644 --- a/man/bootstraps.Rd +++ b/man/bootstraps.Rd @@ -4,7 +4,15 @@ \alias{bootstraps} \title{Bootstrap Sampling} \usage{ -bootstraps(data, times = 25, strata = NULL, breaks = 4, apparent = FALSE, ...) +bootstraps( + data, + times = 25, + strata = NULL, + breaks = 4, + pool = 0.1, + apparent = FALSE, + ... +) } \arguments{ \item{data}{A data frame.} @@ -16,8 +24,13 @@ not \code{NULL}, each bootstrap sample is created within the stratification variable. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} -\item{breaks}{A single number giving the number of bins desired to stratify -a numeric stratification variable.} +\item{breaks}{A single number giving the number of bins desired to stratify a +numeric stratification variable.} + +\item{pool}{A proportion of data used to determine if a particular group is +too small and should be pooled into another group. We do not recommend +decreasing this argument below its default of 0.1 because of the dangers +of stratifying groups that are too small.} \item{apparent}{A logical. Should an extra resample be added where the analysis and holdout subset are the entire data set. This is required for @@ -48,7 +61,7 @@ The \code{strata} argument is based on a similar argument in the random forest package were the bootstrap samples are conducted \emph{within the stratification variable}. 
This can help ensure that the number of data points in the bootstrap sample is equivalent to the proportions in the original data set. -(Strata below 10\% of the total are pooled together.) +(Strata below 10\% of the total are pooled together by default.) } \examples{ bootstraps(mtcars, times = 2) @@ -67,7 +80,7 @@ map_dbl(resample1$splits, }) set.seed(13) -resample2 <- bootstraps(wa_churn, strata = "churn", times = 3) +resample2 <- bootstraps(wa_churn, strata = churn, times = 3) map_dbl(resample2$splits, function(x) { dat <- as.data.frame(x)$churn @@ -75,7 +88,7 @@ map_dbl(resample2$splits, }) set.seed(13) -resample3 <- bootstraps(wa_churn, strata = "tenure", breaks = 6, times = 3) +resample3 <- bootstraps(wa_churn, strata = tenure, breaks = 6, times = 3) map_dbl(resample3$splits, function(x) { dat <- as.data.frame(x)$churn diff --git a/man/initial_split.Rd b/man/initial_split.Rd index aeb22b51..9355983f 100644 --- a/man/initial_split.Rd +++ b/man/initial_split.Rd @@ -7,7 +7,7 @@ \alias{testing} \title{Simple Training/Test Set Splitting} \usage{ -initial_split(data, prop = 3/4, strata = NULL, breaks = 4, ...) +initial_split(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) initial_time_split(data, prop = 3/4, lag = 0, ...) @@ -24,8 +24,13 @@ testing(x) create the resamples. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} -\item{breaks}{A single number giving the number of bins desired to stratify -a numeric stratification variable.} +\item{breaks}{A single number giving the number of bins desired to stratify a +numeric stratification variable.} + +\item{pool}{A proportion of data used to determine if a particular group is +too small and should be pooled into another group. 
We do not recommend +decreasing this argument below its default of 0.1 because of the dangers +of stratifying groups that are too small.} \item{...}{Not currently used.} diff --git a/man/make_strata.Rd b/man/make_strata.Rd index ece03b51..779daa6e 100644 --- a/man/make_strata.Rd +++ b/man/make_strata.Rd @@ -16,7 +16,9 @@ numeric stratification variable.} algorithm.} \item{pool}{A proportion of data used to determine if a particular group is -too small and should be pooled into another group.} +too small and should be pooled into another group. We do not recommend +decreasing this argument below its default of 0.1 because of the dangers +of stratifying groups that are too small.} \item{depth}{An integer that is used to determine the best number of percentiles that should be used. The number of bins are based on diff --git a/man/mc_cv.Rd b/man/mc_cv.Rd index 65a0f04b..f5c54a52 100644 --- a/man/mc_cv.Rd +++ b/man/mc_cv.Rd @@ -4,7 +4,7 @@ \alias{mc_cv} \title{Monte Carlo Cross-Validation} \usage{ -mc_cv(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) +mc_cv(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, pool = 0.1, ...) } \arguments{ \item{data}{A data frame.} @@ -17,8 +17,13 @@ mc_cv(data, prop = 3/4, times = 25, strata = NULL, breaks = 4, ...) create the resamples. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} -\item{breaks}{A single number giving the number of bins desired to stratify -a numeric stratification variable.} +\item{breaks}{A single number giving the number of bins desired to stratify a +numeric stratification variable.} + +\item{pool}{A proportion of data used to determine if a particular group is +too small and should be pooled into another group. 
We do not recommend +decreasing this argument below its default of 0.1 because of the dangers +of stratifying groups that are too small.} \item{...}{Not currently used.} } @@ -36,7 +41,8 @@ data points are added to the assessment set. The \code{strata} argument causes the random sampling to be conducted \emph{within the stratification variable}. This can help ensure that the number of data points in the analysis data is equivalent to the proportions in the -original data set. (Strata below 10\% of the total are pooled together.) +original data set. (Strata below 10\% of the total are pooled together +by default.) } \examples{ mc_cv(mtcars, times = 2) @@ -54,7 +60,7 @@ map_dbl(resample1$splits, }) set.seed(13) -resample2 <- mc_cv(wa_churn, strata = "churn", times = 3, prop = .5) +resample2 <- mc_cv(wa_churn, strata = churn, times = 3, prop = .5) map_dbl(resample2$splits, function(x) { dat <- as.data.frame(x)$churn @@ -62,7 +68,7 @@ map_dbl(resample2$splits, }) set.seed(13) -resample3 <- mc_cv(wa_churn, strata = "tenure", breaks = 6, times = 3, prop = .5) +resample3 <- mc_cv(wa_churn, strata = tenure, breaks = 6, times = 3, prop = .5) map_dbl(resample3$splits, function(x) { dat <- as.data.frame(x)$churn diff --git a/man/validation_split.Rd b/man/validation_split.Rd index 5323e63b..80d0f79d 100644 --- a/man/validation_split.Rd +++ b/man/validation_split.Rd @@ -4,7 +4,7 @@ \alias{validation_split} \title{Create a Validation Set} \usage{ -validation_split(data, prop = 3/4, strata = NULL, breaks = 4, ...) +validation_split(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) } \arguments{ \item{data}{A data frame.} @@ -15,8 +15,13 @@ validation_split(data, prop = 3/4, strata = NULL, breaks = 4, ...) create the resamples. 
This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} -\item{breaks}{A single number giving the number of bins desired to stratify -a numeric stratification variable.} +\item{breaks}{A single number giving the number of bins desired to stratify a +numeric stratification variable.} + +\item{pool}{A proportion of data used to determine if a particular group is +too small and should be pooled into another group. We do not recommend +decreasing this argument below its default of 0.1 because of the dangers +of stratifying groups that are too small.} \item{...}{Not currently used.} } diff --git a/man/vfold_cv.Rd b/man/vfold_cv.Rd index b978fd0e..6f9d12ee 100644 --- a/man/vfold_cv.Rd +++ b/man/vfold_cv.Rd @@ -4,7 +4,7 @@ \alias{vfold_cv} \title{V-Fold Cross-Validation} \usage{ -vfold_cv(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) +vfold_cv(data, v = 10, repeats = 1, strata = NULL, breaks = 4, pool = 0.1, ...) } \arguments{ \item{data}{A data frame.} @@ -17,8 +17,13 @@ vfold_cv(data, v = 10, repeats = 1, strata = NULL, breaks = 4, ...) create the folds. This could be a single character value or a variable name that corresponds to a variable that exists in the data frame.} -\item{breaks}{A single number giving the number of bins desired to stratify -a numeric stratification variable.} +\item{breaks}{A single number giving the number of bins desired to stratify a +numeric stratification variable.} + +\item{pool}{A proportion of data used to determine if a particular group is +too small and should be pooled into another group. We do not recommend +decreasing this argument below its default of 0.1 because of the dangers +of stratifying groups that are too small.} \item{...}{Not currently used.} } @@ -41,7 +46,7 @@ to V. The \code{strata} argument causes the random sampling to be conducted \emph{within the stratification variable}. 
This can help ensure that the number of data points in the analysis data is equivalent to the proportions in the original -data set. (Strata below 10\% of the total are pooled together.) +data set. (Strata below 10\% of the total are pooled together by default.) When more than one repeat is requested, the basic V-fold cross-validation is conducted each time. For example, if three repeats are used with \code{v = 10}, there are a total of 30 splits which as three groups of 10 that are generated separately. @@ -62,7 +67,7 @@ map_dbl(folds1$splits, }) set.seed(13) -folds2 <- vfold_cv(wa_churn, strata = "churn", v = 5) +folds2 <- vfold_cv(wa_churn, strata = churn, v = 5) map_dbl(folds2$splits, function(x) { dat <- as.data.frame(x)$churn @@ -70,7 +75,7 @@ map_dbl(folds2$splits, }) set.seed(13) -folds3 <- vfold_cv(wa_churn, strata = "tenure", breaks = 6, v = 5) +folds3 <- vfold_cv(wa_churn, strata = tenure, breaks = 6, v = 5) map_dbl(folds3$splits, function(x) { dat <- as.data.frame(x)$churn diff --git a/tests/testthat/test_strata.R b/tests/testthat/test_strata.R index 9801d319..6ee88826 100644 --- a/tests/testthat/test_strata.R +++ b/tests/testthat/test_strata.R @@ -17,14 +17,20 @@ test_that('simple numerics', { }) test_that('simple character', { - x2 <- factor(rep(LETTERS[1:5], each = 50)) - str2a <- make_strata(x2) + x2 <- factor(rep(LETTERS[1:12], each = 20)) + expect_warning( + str2a <- make_strata(x2, pool = 0.05), + "Stratifying groups that make up 5%" + ) expect_equal(table(str2a, dnn = ""), table(x2, dnn = "")) + }) test_that('bad data', { x3 <- factor(rep(LETTERS[1:15], each = 50)) - expect_warning(make_strata(x3)) + expect_warning(make_strata(x3), "Too little data") + expect_warning(make_strata(x3, pool = 0.06), + "Stratifying groups that make up 6%") expect_warning(make_strata(mtcars$mpg)) }) diff --git a/tests/testthat/test_vfold.R b/tests/testthat/test_vfold.R index c614b654..236eaf96 100644 --- a/tests/testthat/test_vfold.R +++ b/tests/testthat/test_vfold.R 
@@ -66,6 +66,12 @@ test_that('strata', { length(intersect(x$in_ind, x$out_id)) == 0 }) expect_true(all(good_holdout)) + + expect_warning( + rs4 <- vfold_cv(mlc_churn, strata = state, pool = 0.01), + "Stratifying groups that make up 1%" + ) + })