-
Notifications
You must be signed in to change notification settings - Fork 2.1k
/
generics.R
252 lines (223 loc) · 8.8 KB
/
generics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#' Extending dplyr with new data frame subclasses
#'
#' @description
#' `r lifecycle::badge("experimental")`
#'
#' These three functions, along with `names<-` and 1d numeric `[`
#' (i.e. `x[loc]`) methods, provide a minimal interface for extending dplyr
#' to work with new data frame subclasses. This means that for simple cases
#' you should only need to provide a couple of methods, rather than a method
#' for every dplyr verb.
#'
#' These functions are a stop-gap measure until we figure out how to solve
#' the problem more generally, but it's likely that any code you write to
#' implement them will find a home in what comes next.
#'
#' # Basic advice
#'
#' This section gives you basic advice if you want to extend dplyr to work with
#' your custom data frame subclass, and you want the dplyr methods to behave
#' in basically the same way.
#'
#' * If you have data frame attributes that don't depend on the rows or columns
#' (and should unconditionally be preserved), you don't need to do anything.
#'
#' * If you have __scalar__ attributes that depend on __rows__, implement a
#' `dplyr_reconstruct()` method. Your method should recompute the attribute
#' depending on rows now present.
#'
#' * If you have __scalar__ attributes that depend on __columns__, implement a
#' `dplyr_reconstruct()` method and a 1d `[` method. For example, if your
#' class requires that certain columns be present, your method should return
#' a data.frame or tibble when those columns are removed.
#'
#' * If your attributes are __vectorised__ over __rows__, implement a
#' `dplyr_row_slice()` method. This gives you access to `i` so you can
#' modify the row attribute accordingly. You'll also need to think carefully
#' about how to recompute the attribute in `dplyr_reconstruct()`, and
#' you will need to carefully verify the behaviour of each verb, and provide
#' additional methods as needed.
#'
#' * If your attributes that are __vectorised__ over __columns__, implement
#' `dplyr_col_modify()`, 1d `[`, and `names<-` methods. All of these methods
#' know which columns are being modified, so you can update the column
#' attribute according. You'll also need to think carefully about how to
#' recompute the attribute in `dplyr_reconstruct()`, and you will need to
#' carefully verify the behaviour of each verb, and provide additional
#' methods as needed.
#'
#' # Current usage
#'
#' * `arrange()`, `filter()`, `slice()`, `semi_join()`, and `anti_join()`
#' work by generating a vector of row indices, and then subsetting
#' with `dplyr_row_slice()`.
#'
#' * `mutate()` generates a list of new column value (using `NULL` to indicate
#' when columns should be deleted), then passes that to `dplyr_col_modify()`.
#' `transmute()` does the same then uses 1d `[` to select the columns.
#'
#' * `summarise()` works similarly to `mutate()` but the data modified by
#' `dplyr_col_modify()` comes from `group_data()`.
#'
#' * `select()` uses 1d `[` to select columns, then `names<-` to rename them.
#' `rename()` just uses `names<-`. `relocate()` just uses 1d `[`.
#'
#' * `inner_join()`, `left_join()`, `right_join()`, and `full_join()`
#' coerces `x` to a tibble, modify the rows, then uses `dplyr_reconstruct()`
#' to convert back to the same type as `x`.
#'
#' * `nest_join()` uses `dplyr_col_modify()` to cast the key variables to
#' common type and add the nested-df that `y` becomes.
#'
#' * `distinct()` does a `mutate()` if any expressions are present, then
#' uses 1d `[` to select variables to keep, then `dplyr_row_slice()` to
#' select distinct rows.
#'
#' Note that `group_by()` and `ungroup()` don't use any these generics and
#' you'll need to provide methods directly.
#'
#' @keywords internal
#' @param data A tibble. We use tibbles because they avoid some inconsistent
#' subset-assignment use cases
#' @name dplyr_extending
NULL
#' @export
#' @rdname dplyr_extending
#' @param i A numeric or logical vector that indexes the rows of `.data`.
dplyr_row_slice <- function(data, i, ...) {
if (!is.numeric(i) && !is.logical(i)) {
abort("`i` must be a numeric or logical vector.")
}
UseMethod("dplyr_row_slice")
}
#' @export
dplyr_row_slice.data.frame <- function(data, i, ...) {
dplyr_reconstruct(vec_slice(data, i), data)
}
#' @export
dplyr_row_slice.grouped_df <- function(data, i, ..., preserve = FALSE) {
out <- vec_slice(as.data.frame(data), i)
# Index into group_indices, then use that to restore the grouping structure
groups <- group_data(data)
new_id <- vec_slice(group_indices(data), i)
new_grps <- vec_group_loc(new_id)
rows <- rep(list_of(integer()), length.out = nrow(groups))
rows[new_grps$key] <- new_grps$loc
groups$.rows <- rows
if (!preserve && isTRUE(attr(groups, ".drop"))) {
groups <- group_data_trim(groups)
}
new_grouped_df(out, groups)
}
#' @export
dplyr_row_slice.rowwise_df <- function(data, i, ..., preserve = FALSE) {
out <- vec_slice(data, i)
group_data <- vec_slice(group_keys(data), i)
new_rowwise_df(out, group_data)
}
#' @export
#' @rdname dplyr_extending
#' @param cols A named list used modify columns. A `NULL` value should remove
#' an existing column.
dplyr_col_modify <- function(data, cols) {
UseMethod("dplyr_col_modify")
}
#' @export
dplyr_col_modify.data.frame <- function(data, cols) {
# Must be implemented from first principles to avoiding edge cases in
# [.data.frame and [.tibble (2.1.3 and earlier).
# Apply tidyverse recycling rules
cols <- vec_recycle_common(!!!cols, .size = nrow(data))
# Transform to list to avoid stripping inner names with `[[<-`
out <- as.list(dplyr_vec_data(data))
nms <- as_utf8_character(names2(cols))
names(out) <- as_utf8_character(names2(out))
for (i in seq_along(cols)) {
nm <- nms[[i]]
out[[nm]] <- cols[[i]]
}
# Transform back to data frame before reconstruction
row_names <- .row_names_info(data, type = 0L)
out <- new_data_frame(out, n = nrow(data), row.names = row_names)
dplyr_reconstruct(out, data)
}
#' @export
dplyr_col_modify.grouped_df <- function(data, cols) {
out <- dplyr_col_modify(as_tibble(data), cols)
if (any(names(cols) %in% group_vars(data))) {
# regroup
grouped_df(out, group_vars(data), drop = group_by_drop_default(data))
} else {
new_grouped_df(out, group_data(data))
}
}
#' @export
dplyr_col_modify.rowwise_df <- function(data, cols) {
out <- dplyr_col_modify(as_tibble(data), cols)
rowwise_df(out, group_vars(data))
}
#' @param template Template to use for restoring attributes
#' @export
#' @rdname dplyr_extending
dplyr_reconstruct <- function(data, template) {
# Strip attributes before dispatch to make it easier to implement
# methods and prevent unexpected leaking of irrelevant attributes.
data <- dplyr_new_data_frame(data)
return(dplyr_reconstruct_dispatch(data, template))
UseMethod("dplyr_reconstruct", template)
}
dplyr_reconstruct_dispatch <- function(data, template) {
UseMethod("dplyr_reconstruct", template)
}
#' @export
dplyr_reconstruct.data.frame <- function(data, template) {
attrs <- attributes(template)
attrs$names <- names(data)
attrs$row.names <- .row_names_info(data, type = 0L)
attributes(data) <- attrs
data
}
#' @export
dplyr_reconstruct.grouped_df <- function(data, template) {
group_vars <- group_intersect(template, data)
grouped_df(data, group_vars, drop = group_by_drop_default(template))
}
#' @export
dplyr_reconstruct.rowwise_df <- function(data, template) {
group_vars <- group_intersect(template, data)
rowwise_df(data, group_vars)
}
dplyr_col_select <- function(.data, loc, names = NULL, error_call = caller_env()) {
loc <- vec_as_location(loc, n = ncol(.data), names = names(.data))
out <- .data[loc]
if (!inherits(out, "data.frame")) {
classes_data <- glue_collapse(class(.data), sep = "/")
classes_out <- glue_collapse(class(out), sep = "/")
bullets <- c(
"Can't reconstruct data frame.",
x = glue("The `[` method for class <{classes_data}> must return a data frame."),
i = glue("It returned a <{classes_out}>.")
)
abort(bullets, call = error_call)
}
if (length(out) != length(loc)) {
classes_data <- glue_collapse(class(.data), sep = "/")
classes_out <- glue_collapse(class(out), sep = "/")
s <- function(x) if (length(x) == 1) "" else "s"
bullets <- c(
"Can't reconstruct data frame.",
x = glue("The `[` method for class <{classes_data}> must return a data frame with {length(loc)} column{s(loc)}."),
i = glue("It returned a <{classes_out}> of {length(out)} column{s(out)}.")
)
abort(bullets, call = error_call)
}
# Patch base data frames to restore extra attributes that `[.data.frame` drops.
# We require `[` methods to keep extra attributes for all data frame subclasses.
if (identical(class(.data), "data.frame")) {
out <- dplyr_reconstruct(out, .data)
}
if (!is.null(names)) {
names(out) <- names
}
out
}