-
Notifications
You must be signed in to change notification settings - Fork 66
/
make_strata.R
141 lines (136 loc) · 4.59 KB
/
make_strata.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#' Create or Modify Stratification Variables
#'
#' This function can create strata from numeric data and make non-numeric data
#' more conducive for stratification.
#'
#' @details
#' For numeric data, if the number of unique levels is less than
#' `nunique`, the data are treated as categorical data.
#'
#' For categorical inputs, the function will find levels of `x` than
#' occur in the data with percentage less than `pool`. The values from
#' these groups will be randomly assigned to the remaining strata (as will
#' data points that have missing values in `x`).
#'
#' For numeric data with more unique values than `nunique`, the data
#' will be converted to being categorical based on percentiles of the data.
#' The percentile groups will have no more than 20 percent of the data in
#' each group. Again, missing values in `x` are randomly assigned
#' to groups.
#'
#' @param x An input vector.
#' @param breaks A single number giving the number of bins desired to stratify a
#' numeric stratification variable.
#' @param nunique An integer for the number of unique value threshold in the
#' algorithm.
#' @param pool A proportion of data used to determine if a particular group is
#' too small and should be pooled into another group. We do not recommend
#' decreasing this argument below its default of 0.1 because of the dangers
#' of stratifying groups that are too small.
#' @param depth An integer that is used to determine the best number of
#' percentiles that should be used. The number of bins are based on
#' `min(5, floor(n / depth))` where `n = length(x)`.
#' If `x` is numeric, there must be at least 40 rows in the data set
#' (when `depth = 20`) to conduct stratified sampling.
#'
#' @export
#' @return A factor vector.
#' @examples
#' set.seed(61)
#' x1 <- rpois(100, lambda = 5)
#' table(x1)
#' table(make_strata(x1))
#'
#' set.seed(554)
#' x2 <- rpois(100, lambda = 1)
#' table(x2)
#' table(make_strata(x2))
#'
#' # small groups are randomly assigned
#' x3 <- factor(x2)
#' table(x3)
#' table(make_strata(x3))
#'
#' # `oilType` data from `caret`
#' x4 <- rep(LETTERS[1:7], c(37, 26, 3, 7, 11, 10, 2))
#' table(x4)
#' table(make_strata(x4))
#' table(make_strata(x4, pool = 0.1))
#' table(make_strata(x4, pool = 0.0))
#'
#' # not enough data to stratify
#' x5 <- rnorm(20)
#' table(make_strata(x5))
#'
#' set.seed(483)
#' x6 <- rnorm(200)
#' quantile(x6, probs = (0:10) / 10)
#' table(make_strata(x6, breaks = 10))
#' @export
make_strata <- function(x, breaks = 4, nunique = 5, pool = .1, depth = 20) {
default_pool <- 0.1
num_vals <- unique(stats::na.omit(x))
n <- length(x)
if (length(num_vals) <= nunique | is.character(x) | is.factor(x)) {
x <- factor(x)
xtab <- sort(table(x))
pcts <- xtab / n
## This should really be based on some combo of rate and number.
if (all(pcts < pool)) {
rlang::warn(c(
"Too little data to stratify.",
"Resampling will be unstratified."
))
return(factor(rep("strata1", n)))
}
if (pool < default_pool & any(pcts < default_pool)) {
rlang::warn(c(
paste0(
"Stratifying groups that make up ",
round(100 * pool), "% of the data may be ",
"statistically risky."
),
"Consider increasing `pool` to at least 0.1"
))
}
## Small groups will be randomly allocated to stratas at end
## These should probably go into adjacent groups but this works for now
if (any(pcts < pool)) {
x[x %in% names(pcts)[pcts < pool]] <- NA
}
## The next line will also relevel the data if `x` was a factor
out <- factor(as.character(x))
} else {
if (breaks < 2) {
rlang::warn(c(
"The bins specified by `breaks` must be >=2.",
"Resampling will be unstratified."
))
return(factor(rep("strata1", n)))
} else if (floor(n / breaks) < depth) {
rlang::warn(c(
paste0(
"The number of observations in each quantile is ",
"below the recommended threshold of ", depth, "."
),
paste0("Stratification will use ", floor(n / depth), " breaks instead.")
))
}
breaks <- min(breaks, floor(n / depth))
if (breaks < 2) {
rlang::warn(c(
"Too little data to stratify.",
"Resampling will be unstratified."
))
return(factor(rep("strata1", n)))
}
pctls <- quantile(x, probs = (0:breaks) / breaks, na.rm = TRUE)
pctls <- unique(pctls)
out <- cut(x, breaks = pctls, include.lowest = TRUE)
}
num_miss <- sum(is.na(x))
if (num_miss > 0) {
out[is.na(x)] <- sample(levels(out), size = num_miss, replace = TRUE)
}
out
}