tidymodels · juliasilge · Jun 29, 2022 · Jun 28, 2022 · Jun 28, 2022 · Jun 28, 2022
diff --git a/NAMESPACE b/NAMESPACE
@@ -230,7 +230,9 @@ export(ends_with)
 export(everything)
 export(form_pred)
 export(gather)
+export(group_initial_split)
 export(group_mc_cv)
+export(group_validation_split)
 export(group_vfold_cv)
 export(initial_split)
 export(initial_time_split)

diff --git a/NEWS.md b/NEWS.md
@@ -2,7 +2,7 @@
 
 * Added arguments to control how `group_vfold_cv()` combines groups. Use `balance = "groups"` to assign (roughly) the same number of groups to each fold, or `balance = "observations"` to assign (roughly) the same number of observations to each fold.
 
-* Added a new function, `group_mc_cv()`, which performs grouped Monte-Carlo resampling.
+* Added new functions for grouped resampling: `group_mc_cv()` (#313), `group_initial_split()` and `group_validation_split()` (#315).
 
 # rsample 1.0.0
 

diff --git a/R/initial_split.R b/R/initial_split.R
@@ -3,6 +3,9 @@
 #' `initial_split` creates a single binary split of the data into a training
 #'  set and testing set. `initial_time_split` does the same, but takes the
 #'  _first_ `prop` samples for training, instead of a random selection.
+#'  `group_initial_split` creates a single split of the data based on some
+#'  grouping variable, so that all rows in a "group" are assigned to the
+#'  same set (training or testing).
 #'  `training` and `testing` are used to extract the resulting data.
 #' @template strata_details
 #' @inheritParams vfold_cv
@@ -28,6 +31,12 @@
 #' train_data <- training(drinks_lag_split)
 #' test_data <- testing(drinks_lag_split)
 #' c(max(train_data$date), min(test_data$date)) # 12 period lag
+#'
+#' set.seed(1353)
+#' car_split <- group_initial_split(mtcars, cyl)
+#' train_data <- training(car_split)
+#' test_data <- testing(car_split)
+#'
 #' @export
 #'
 initial_split <- function(data, prop = 3 / 4,
@@ -85,3 +94,23 @@ training <- function(x) analysis(x)
 #' @rdname initial_split
 #' @export
 testing <- function(x) assessment(x)
+
+#' @inheritParams make_groups
+#' @rdname initial_split
+#' @export
+group_initial_split <- function(data, group, prop = 3 / 4, ...) {
+  # A grouped initial split is a grouped Monte Carlo resample with a single
+  # repeat, so delegate to `group_mc_cv()` with `times = 1`; `...` is passed
+  # along untouched.
+  mc_rset <- group_mc_cv(
+    data = data, group = {{ group }}, prop = prop, times = 1, ...
+  )
+
+  # Only one repeat was requested, so keep the first element of `splits`.
+  only_split <- mc_rset[["splits"]][[1]]
+
+  # Put "initial_split" ahead of the classes the split already carries.
+  class(only_split) <- c("initial_split", class(only_split))
+  only_split
+}
+
diff --git a/R/make_groups.R b/R/make_groups.R
@@ -100,21 +100,9 @@ balance_observations <- function(data_ind, v, ...) {
     most_improved <- which.min(group_breakdown$improvement)
     freq_table[next_row, ]$assignment <- group_breakdown[most_improved, ]$assignment
   }
-  data_ind <- dplyr::left_join(data_ind, freq_table, by = c("..group" = "key"))
-  data_ind$..group <- data_ind$assignment
-  data_ind <- data_ind[c("..index", "..group")]
-
-  unique_groups <- unique(data_ind$..group)
 
-  keys <- data.frame(
-    ..group = unique_groups,
-    ..folds = sample(rep(seq_len(v), length.out = length(unique_groups)))
-  )
+  collapse_groups(freq_table, data_ind, v)
 
-  list(
-    data_ind = data_ind,
-    keys = keys
-  )
 }
 
 balance_prop <- function(prop, data_ind, v, ...) {
@@ -141,10 +129,23 @@ balance_prop <- function(prop, data_ind, v, ...) {
     }
   )
 
+  collapse_groups(freq_table, data_ind, v)
+
+}
+
+collapse_groups <- function(freq_table, data_ind, v) {
   data_ind <- dplyr::left_join(data_ind, freq_table, by = c("..group" = "key"))
   data_ind$..group <- data_ind$assignment
   data_ind <- data_ind[c("..index", "..group")]
 
+  # If a group was never assigned a fold, then its `..group` is NA
+  #
+  # If we leave that alone, it winds up messing up our fold assignments,
+  # because it will be assigned some value in `seq_len(v)`
+  #
+  # So instead, we drop those groups here:
+  data_ind <- stats::na.omit(data_ind)
+
   unique_groups <- unique(data_ind$..group)
 
   keys <- data.frame(

diff --git a/R/validation_split.R b/R/validation_split.R
@@ -5,6 +5,9 @@
 #'  added to the assessment set (to be used as the validation set).
 #'  `validation_time_split()` does the same, but takes the _first_ `prop` samples
 #'  for training, instead of a random selection.
+#'  `group_validation_split()` creates a single split of the data based on some
+#'  grouping variable, so that all rows in a "group" are assigned to the same
+#'  set (analysis or validation).
 #' @template strata_details
 #' @inheritParams vfold_cv
 #' @inheritParams make_strata
@@ -19,6 +22,8 @@
 #'
 #' data(drinks, package = "modeldata")
 #' validation_time_split(drinks)
+#'
+#' group_validation_split(mtcars, cyl)
 #' @export
 validation_split <- function(data, prop = 3 / 4,
                              strata = NULL, breaks = 4, pool = 0.1, ...) {
@@ -93,3 +98,41 @@ validation_time_split <- function(data, prop = 3 / 4, lag = 0, ...) {
     subclass = c("validation_split", "rset")
   )
 }
+
+#' @rdname validation_split
+#' @inheritParams group_initial_split
+#' @export
+group_validation_split <- function(data, group, prop = 3 / 4, ...) {
+  # No extra arguments are supported: anything passed through `...` errors.
+  rlang::check_dots_empty()
+
+  # Validate the caller's `group` against `data`; the result replaces `group`.
+  group <- validate_group({{ group }}, data)
+
+  # `group` is now a plain local value, so it is passed without embracing.
+  split_objs <-
+    group_mc_splits(
+      data = data,
+      group = group,
+      prop = prop,
+      times = 1
+    )
+
+  ## We remove the holdout indices since it will save space and we can
+  ## derive them later when they are needed.
+  split_objs$splits <- map(split_objs$splits, rm_out)
+  class(split_objs$splits[[1]]) <- c("val_split", "rsplit")
+
+  val_att <- list(
+    prop = prop,
+    group = group,
+    strata = FALSE
+  )
+
+  new_rset(
+    splits = split_objs$splits,
+    ids = "validation",
+    attrib = val_att,
+    subclass = c("validation_split", "rset")
+  )
+}
+
diff --git a/man/initial_split.Rd b/man/initial_split.Rd
diff --git a/man/validation_split.Rd b/man/validation_split.Rd
diff --git a/tests/testthat/helpers-rsample.R b/tests/testthat/helpers-rsample.R
@@ -1,4 +1,4 @@
-dat1 <- data.frame(a = 1:20, b = letters[1:20])
+dat1 <- data.frame(a = 1:20, b = letters[1:20], c = rep(1:4, 5))
 car_folds <- vfold_cv(mtcars)
 
 new_rng_snapshots <- utils::compareVersion("3.6.0", as.character(getRversion())) > 0

diff --git a/tests/testthat/test-initial.R b/tests/testthat/test-initial.R
@@ -42,6 +42,15 @@ test_that("default time param with lag", {
   )
 })
 
+test_that("default group param", {
+  # `dat1$c` has four equal groups, so the default 3/4 split is exact.
+  gsplit <- group_initial_split(dat1, c)
+  expect_equal(class(gsplit), c("initial_split", "grouped_mc_split", "rsplit"))
+  n_rows <- nrow(dat1)
+  expect_equal(nrow(training(gsplit)), n_rows * 3 / 4)
+  expect_equal(nrow(testing(gsplit)), n_rows / 4)
+})
+
 test_that("`prop` computes the proportion for analysis (#217)", {
   set.seed(11)
 
@@ -64,6 +73,15 @@ test_that("`prop` computes the proportion for analysis (#217)", {
   }
 })
 
+test_that("`prop` computes the proportion for group analysis", {
+  # `dat1$c` has four equal groups, so `prop = 1 / 2` splits the rows evenly.
+  gsplit <- group_initial_split(dat1, c, prop = 1 / 2)
+  expect_equal(class(gsplit), c("initial_split", "grouped_mc_split", "rsplit"))
+  n_train <- nrow(training(gsplit))
+  expect_equal(n_train, nrow(dat1) * 1 / 2)
+  expect_equal(nrow(testing(gsplit)), nrow(dat1) / 2)
+  expect_equal(n_train, nrow(testing(gsplit)))
+})
 
 test_that("printing initial split objects", {
   expect_snapshot(initial_split(mtcars))

diff --git a/tests/testthat/test-mc.R b/tests/testthat/test-mc.R
@@ -144,6 +144,29 @@ test_that("grouping - tibble input", {
 
 })
 
+test_that("grouping with times = 1 works", {
+  set.seed(11)
+  rs3 <- group_mc_cv(warpbreaks, "tension", times = 1)
+  sizes3 <- dim_rset(rs3)
+
+  expect_true(all(sizes3$analysis == 36))
+  expect_true(all(sizes3$assessment == 18))
+  # all.equal() returns a character vector on mismatch, so wrap it in isTRUE().
+  same_data <- purrr::map_lgl(rs3$splits, function(x) {
+    isTRUE(all.equal(x$data, warpbreaks))
+  })
+  expect_true(all(same_data))
+
+  # No `tension` level may appear on both sides of a grouped split.
+  good_holdout <- purrr::map_lgl(
+    rs3$splits,
+    function(x) {
+      length(intersect(analysis(x)$tension, assessment(x)$tension)) == 0
+    }
+  )
+  expect_true(all(good_holdout))
+})
+
 test_that("grouping - printing", {
   expect_snapshot(group_mc_cv(warpbreaks, "tension"))
 })

diff --git a/tests/testthat/test-validation.R b/tests/testthat/test-validation.R
@@ -59,6 +59,27 @@ test_that("default time param with lag", {
   expect_snapshot(validation_time_split(drinks, lag = 500), error = TRUE)
 })
 
+test_that("default group param", {
+  set.seed(11)
+  rs1 <- group_validation_split(dat1, c)
+  sizes1 <- dim_rset(rs1)
+
+  expect_true(all(sizes1$analysis == 15))
+  expect_true(all(sizes1$assessment == 5))
+  # all.equal() returns a character vector on mismatch, so wrap it in isTRUE().
+  same_data <-
+    purrr::map_lgl(rs1$splits, function(x) {
+      isTRUE(all.equal(x$data, dat1))
+    })
+  expect_true(all(same_data))
+
+  # The stored holdout indices are dropped by `group_validation_split()`, so
+  # check the grouping contract directly: no `c` value on both sides.
+  good_holdout <- purrr::map_lgl(rs1$splits, function(x) {
+    length(intersect(analysis(x)$c, assessment(x)$c)) == 0
+  })
+  expect_true(all(good_holdout))
+})
 
 test_that("different percent", {
   set.seed(11)
@@ -80,6 +101,27 @@ test_that("different percent", {
     }
   )
   expect_true(all(good_holdout))
+
+  set.seed(11)
+  rs2_group <- group_validation_split(dat1, c, prop = .5)
+  sizes2_group <- dim_rset(rs2_group)
+
+  expect_true(all(sizes2_group$analysis == 10))
+  expect_true(all(sizes2_group$assessment == 10))
+  same_data <-
+    purrr::map_lgl(rs2_group$splits, function(x) {
+      all.equal(x$data, dat1)
+    })
+  expect_true(all(same_data))
+
+  good_holdout <- purrr::map_lgl(
+    rs2_group$splits,
+    function(x) {
+      length(intersect(x$in_ind, x$out_id)) == 0
+    }
+  )
+  expect_true(all(good_holdout))
+
 })
 
 test_that("strata", {