-
Notifications
You must be signed in to change notification settings - Fork 66
/
make_strata.R
129 lines (123 loc) · 4.52 KB
/
make_strata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#' Create or Modify Stratification Variables
#'
#' This function can create strata from numeric data and make non-numeric data
#' more conducive for stratification.
#'
#' @details
#' For numeric data, if the number of unique levels is less than
#' `nunique`, the data are treated as categorical data.
#'
#' For categorical inputs, the function will find levels of `x` than
#' occur in the data with percentage less than `pool`. The values from
#' these groups will be randomly assigned to the remaining strata (as will
#' data points that have missing values in `x`).
#'
#' For numeric data with more unique values than `nunique`, the data
#' will be converted to being categorical based on percentiles of the data.
#' The percentile groups will have no more than 20 percent of the data in
#' each group. Again, missing values in `x` are randomly assigned
#' to groups.
#'
#' @param x An input vector.
#' @param breaks A single number giving the number of bins desired to stratify a
#' numeric stratification variable.
#' @param nunique An integer for the number of unique value threshold in the
#' algorithm.
#' @param pool A proportion of data used to determine if a particular group is
#' too small and should be pooled into another group. We do not recommend
#' decreasing this argument below its default of 0.1 because of the dangers
#' of stratifying groups that are too small.
#' @param depth An integer that is used to determine the best number of
#' percentiles that should be used. The number of bins are based on
#' `min(5, floor(n / depth))` where `n = length(x)`.
#' If `x` is numeric, there must be at least 40 rows in the data set
#' (when `depth = 20`) to conduct stratified sampling.
#'
#' @export
#' @return A factor vector.
#' @examples
#' set.seed(61)
#' x1 <- rpois(100, lambda = 5)
#' table(x1)
#' table(make_strata(x1))
#'
#' set.seed(554)
#' x2 <- rpois(100, lambda = 1)
#' table(x2)
#' table(make_strata(x2))
#'
#' # small groups are randomly assigned
#' x3 <- factor(x2)
#' table(x3)
#' table(make_strata(x3))
#'
#' # `oilType` data from `caret`
#' x4 <- rep(LETTERS[1:7], c(37, 26, 3, 7, 11, 10, 2))
#' table(x4)
#' table(make_strata(x4))
#' table(make_strata(x4, pool = 0.1))
#' table(make_strata(x4, pool = 0.0))
#'
#' # not enough data to stratify
#' x5 <- rnorm(20)
#' table(make_strata(x5))
#'
#' set.seed(483)
#' x6 <- rnorm(200)
#' quantile(x6, probs = (0:10)/10)
#' table(make_strata(x6, breaks = 10))
#' @export
make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) {
default_pool <- 0.1
num_vals <- unique(stats::na.omit(x))
n <- length(x)
if (length(num_vals) <= nunique | is.character(x) | is.factor(x)) {
x <- factor(x)
xtab <- sort(table(x))
pcts <- xtab / n
## This should really be based on some combo of rate and number.
if (all(pcts < pool)) {
rlang::warn(c("Too little data to stratify.",
"Resampling will be unstratified."))
return(factor(rep("strata1", n)))
}
if (pool < default_pool & any(pcts < default_pool))
rlang::warn(c(
paste0("Stratifying groups that make up ",
round(100 * pool), "% of the data may be ",
"statistically risky."),
"Consider increasing `pool` to at least 0.1"
))
## Small groups will be randomly allocated to stratas at end
## These should probably go into adjacent groups but this works for now
if (any(pcts < pool))
x[x %in% names(pcts)[pcts < pool]] <- NA
## The next line will also relevel the data if `x` was a factor
out <- factor(as.character(x))
} else {
if (breaks < 2) {
rlang::warn(c("The bins specified by `breaks` must be >=2.",
"Resampling will be unstratified."))
return(factor(rep("strata1", n)))
} else if (floor(n / breaks) < depth) {
rlang::warn(c(
paste0("The number of observations in each quantile is ",
"below the recommended threshold of ", depth, "."),
paste0("Stratification will use ", floor(n/depth), " breaks instead.")
))
}
breaks <- min(breaks, floor(n/depth))
if (breaks < 2) {
rlang::warn(c("Too little data to stratify.",
"Resampling will be unstratified."))
return(factor(rep("strata1", n)))
}
pctls <- quantile(x, probs = (0:breaks) / breaks, na.rm = TRUE)
pctls <- unique(pctls)
out <- cut(x, breaks = pctls, include.lowest = TRUE)
}
num_miss <- sum(is.na(x))
if (num_miss > 0)
out[is.na(x)] <- sample(levels(out), size = num_miss, replace = TRUE)
out
}