-
Notifications
You must be signed in to change notification settings - Fork 985
/
setops.R
287 lines (269 loc) · 16 KB
/
setops.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# For internal use only (input symbol requirement is not checked)
# cols [symbol] - columns provided to function argument
# dt [symbol] - a data.table
# Iff all of 'cols' is present in 'x' return col indices
# is.data.table(dt) check should be performed in the calling function
validate <- function(cols, dt) {
argcols = deparse(substitute(cols))
argdt = deparse(substitute(dt))
origcols = cols
if (is.character(cols)) cols = chmatch(cols, names(dt))
cols = as.integer(cols)
isna = which(!cols %in% seq_along(dt))
if (length(isna))
stop(argcols, " value(s) [", paste(origcols[isna], collapse=", "), "] not present (or out of range) in ", argdt)
cols
}
# setdiff for data.tables, internal at the moment #547, used in not-join
setdiff_ <- function(x, y, by.x=seq_along(x), by.y=seq_along(y), use.names=FALSE) {
if (!is.data.table(x) || !is.data.table(y)) stop("x and y must both be data.tables")
if (is.null(x) || !length(x)) return(x)
by.x = validate(by.x, x)
if (is.null(y) || !length(y)) return(unique(x, by=by.x))
by.y = validate(by.y, y)
if (length(by.x) != length(by.y)) stop("length(by.x) != length(by.y)")
# factor in x should've factor/character in y, and viceversa
for (a in seq_along(by.x)) {
lc = by.y[a]
rc = by.x[a]
icnam = names(y)[lc]
xcnam = names(x)[rc]
if ( is.character(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) {
stop("When x's column ('",xcnam,"') is character, the corresponding column in y ('",icnam,"') should be factor or character, but found incompatible type '",typeof(y[[lc]]),"'.")
} else if ( is.factor(x[[rc]]) && !(is.character(y[[lc]]) || is.factor(y[[lc]])) ) {
stop("When x's column ('",xcnam,"') is factor, the corresponding column in y ('",icnam,"') should be character or factor, but found incompatible type '",typeof(y[[lc]]),"'.")
} else if ( (is.integer(x[[rc]]) || is.double(x[[rc]])) && (is.logical(y[[lc]]) || is.character(y[[lc]])) ) {
stop("When x's column ('",xcnam,"') is integer or numeric, the corresponding column in y ('",icnam,"') can not be character or logical types, but found incompatible type '",typeof(y[[lc]]),"'.")
}
}
ux = unique(shallow(x, by.x))
uy = unique(shallow(y, by.y))
ix = duplicated(rbind(uy, ux, use.names=use.names, fill=FALSE))[-seq_len(nrow(uy))]
.Call(CsubsetDT, ux, which_(ix, FALSE), seq_along(ux)) # more memory efficient version of which(!ix)
}
# set operators ----
funique <- function(x) {
stopifnot(is.data.table(x))
dup = duplicated(x)
if (any(dup)) .Call(CsubsetDT, x, which_(dup, FALSE), seq_along(x)) else x
}
fintersect <- function(x, y, all=FALSE) {
if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one")
if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
if (!identical(names(x), names(y))) stop("x and y must have same column order")
bad.type = setNames(c("raw","complex","list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex","list"))
if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
if (".seqn" %in% names(x)) stop("None of the datasets to intersect should contain a column named '.seqn'")
if (!nrow(x) || !nrow(y)) return(x[0L])
if (all) {
x = shallow(x)[, ".seqn" := rowidv(x)]
y = shallow(y)[, ".seqn" := rowidv(y)]
jn.on = c(".seqn",setdiff(names(x),".seqn"))
x[y, .SD, .SDcols=setdiff(names(x),".seqn"), nomatch=0L, on=jn.on]
} else {
x[funique(y), nomatch=0L, on=names(x), mult="first"]
}
}
fsetdiff <- function(x, y, all=FALSE) {
if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one")
if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
if (!identical(names(x), names(y))) stop("x and y must have same column order")
bad.type = setNames(c("raw","complex","list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex","list"))
if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
if (".seqn" %in% names(x)) stop("None of the datasets to setdiff should contain a column named '.seqn'")
if (!nrow(x)) return(x)
if (!nrow(y)) return(if (!all) funique(x) else x)
if (all) {
x = shallow(x)[, ".seqn" := rowidv(x)]
y = shallow(y)[, ".seqn" := rowidv(y)]
jn.on = c(".seqn",setdiff(names(x),".seqn"))
x[!y, .SD, .SDcols=setdiff(names(x),".seqn"), on=jn.on]
} else {
funique(x[!y, on=names(x)])
}
}
funion <- function(x, y, all=FALSE) {
if (!is.logical(all) || length(all) != 1L) stop("argument 'all' should be logical of length one")
if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
if (!identical(names(x), names(y))) stop("x and y must have same column order")
bad.type = setNames(c("raw","complex", if(!all) "list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex", if(!all) "list"))
if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
ans = rbindlist(list(x, y))
if (!all) ans = funique(ans)
ans
}
fsetequal <- function(x, y) {
if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
if (!identical(sort(names(x)), sort(names(y)))) stop("x and y must have same column names")
if (!identical(names(x), names(y))) stop("x and y must have same column order")
bad.type = setNames(c("raw","complex","list") %chin% c(vapply(x, typeof, FUN.VALUE = ""), vapply(y, typeof, FUN.VALUE = "")), c("raw","complex","list"))
if (any(bad.type)) stop(sprintf("x and y must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
if (!identical(lapply(x, class), lapply(y, class))) stop("x and y must have same column classes")
isTRUE(all.equal.data.table(x, y, check.attributes = FALSE, ignore.row.order = TRUE))
}
# all.equal ----
all.equal.data.table <- function(target, current, trim.levels=TRUE, check.attributes=TRUE, ignore.col.order=FALSE, ignore.row.order=FALSE, tolerance=sqrt(.Machine$double.eps), ...) {
stopifnot(is.logical(trim.levels), is.logical(check.attributes), is.logical(ignore.col.order), is.logical(ignore.row.order), is.numeric(tolerance))
if (!is.data.table(target) || !is.data.table(current)) stop("'target' and 'current' must be both data.tables")
msg = character(0L)
# init checks that detect high level all.equal
if (nrow(current) != nrow(target)) msg = "Different number of rows"
if (ncol(current) != ncol(target)) msg = c(msg, "Different number of columns")
diff.colnames = !identical(sort(names(target)), sort(names(current)))
diff.colorder = !identical(names(target), names(current))
if (check.attributes && diff.colnames) msg = c(msg, "Different column names")
if (!diff.colnames && !ignore.col.order && diff.colorder) msg = c(msg, "Different column order")
if (length(msg)) return(msg) # skip check.attributes and further heavy processing
# ignore.col.order
if (ignore.col.order && diff.colorder) current = setcolorder(shallow(current), names(target))
# Always check modes equal, like base::all.equal
targetModes = vapply_1c(target, mode)
currentModes = vapply_1c(current, mode)
if (any( d<-(targetModes!=currentModes) )) {
w = head(which(d),3L)
return(paste0("Datasets have different column modes. First 3: ",paste(
paste0(names(targetModes)[w],"(",paste(targetModes[w],currentModes[w],sep="!="),")")
,collapse=" ")))
}
if (check.attributes) {
squashClass = function(x) if (is.object(x)) paste(class(x),collapse=";") else mode(x)
# else mode() is so that integer==numeric, like base all.equal does.
targetTypes = vapply_1c(target, squashClass)
currentTypes = vapply_1c(current, squashClass)
if (length(targetTypes) != length(currentTypes))
stop("Internal error: ncol(current)==ncol(target) was checked above")
if (any( d<-(targetTypes != currentTypes))) {
w = head(which(d),3L)
return(paste0("Datasets have different column classes. First 3: ",paste(
paste0(names(targetTypes)[w],"(",paste(targetTypes[w],currentTypes[w],sep="!="),")")
,collapse=" ")))
}
}
if (check.attributes) {
# check key
k1 = key(target)
k2 = key(current)
if (!identical(k1, k2)) {
return(sprintf("Datasets has different keys. 'target'%s. 'current'%s.",
if(length(k1)) paste0(": ", paste(k1, collapse=", ")) else " has no key",
if(length(k2)) paste0(": ", paste(k2, collapse=", ")) else " has no key"))
}
# check index
i1 = indices(target)
i2 = indices(current)
if (!identical(i1, i2)) {
return(sprintf("Datasets has different indexes. 'target'%s. 'current'%s.",
if(length(i1)) paste0(": ", paste(i1, collapse=", ")) else " has no index",
if(length(i2)) paste0(": ", paste(i2, collapse=", ")) else " has no index"))
}
# Trim any extra row.names attributes that came from some inheritence
# Trim ".internal.selfref" as long as there is no `all.equal.externalptr` method
exclude.attrs = function(x, attrs = c("row.names",".internal.selfref")) x[!names(x) %in% attrs]
a1 = exclude.attrs(attributes(target))
a2 = exclude.attrs(attributes(current))
if (length(a1) != length(a2)) return(sprintf("Datasets has different number of (non-excluded) attributes: target %s, current %s", length(a1), length(a2)))
if (!identical(nm1 <- sort(names(a1)), nm2 <- sort(names(a2)))) return(sprintf("Datasets has attributes with different names: %s", paste(setdiff(union(names(a1), names(a2)), intersect(names(a1), names(a2))), collapse=", ")))
attrs.r = all.equal(a1[nm1], a2[nm2], ..., check.attributes = check.attributes)
if (is.character(attrs.r)) return(paste("Attributes: <", attrs.r, ">")) # skip further heavy processing
}
if (ignore.row.order) {
if (".seqn" %in% names(target))
stop("None of the datasets to compare should contain a column named '.seqn'")
bad.type = setNames(c("raw","complex","list") %chin% c(vapply(current, typeof, FUN.VALUE = ""), vapply(target, typeof, FUN.VALUE = "")), c("raw","complex","list"))
if (any(bad.type))
stop(sprintf("Datasets to compare with 'ignore.row.order' must not have unsupported column types: %s", paste(names(bad.type)[bad.type], collapse=", ")))
if (between(tolerance, 0, sqrt(.Machine$double.eps), incbounds=FALSE)) {
warning(sprintf("Argument 'tolerance' was forced to lowest accepted value `sqrt(.Machine$double.eps)` from provided %s", format(tolerance, scientific=FALSE)))
tolerance = sqrt(.Machine$double.eps)
}
target_dup = as.logical(anyDuplicated(target))
current_dup = as.logical(anyDuplicated(current))
tolerance.msg = if (identical(tolerance, 0)) ", be aware you are using `tolerance=0` which may result into visually equal data" else ""
if (target_dup || current_dup) {
# handling 'tolerance' for duplicate rows - those `msg` will be returned only when equality with tolerance will fail
if (any(vapply_1c(target,typeof)=="double") && !identical(tolerance, 0)) {
if (target_dup && !current_dup) msg = c(msg, "Dataset 'target' has duplicate rows while 'current' doesn't")
else if (!target_dup && current_dup) msg = c(msg, "Dataset 'current' has duplicate rows while 'target' doesn't")
else { # both
if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error
stop("Duplicate rows in datasets, numeric columns and ignore.row.order cannot be used with non 0 tolerance argument")
msg = c(msg, "Both datasets have duplicate rows, they also have numeric columns, together with ignore.row.order this force 'tolerance' argument to 0")
tolerance = 0
}
} else { # no numeric columns or tolerance==0L
if (target_dup && !current_dup)
return(sprintf("Dataset 'target' has duplicate rows while 'current' doesn't%s", tolerance.msg))
if (!target_dup && current_dup)
return(sprintf("Dataset 'current' has duplicate rows while 'target' doesn't%s", tolerance.msg))
}
}
jn.on = if (target_dup && current_dup) {
target = shallow(target)[, ".seqn" := rowidv(target)]
current = shallow(current)[, ".seqn" := rowidv(current)]
c(".seqn", setdiff(names(target), ".seqn"))
} else names(target)
# handling 'tolerance' for factor cols - those `msg` will be returned only when equality with tolerance will fail
if (any(vapply_1b(target,is.factor)) && !identical(tolerance, 0)) {
if (!identical(tolerance, sqrt(.Machine$double.eps))) # non-default will raise error
stop("Factor columns and ignore.row.order cannot be used with non 0 tolerance argument")
msg = c(msg, "Using factor columns together together with ignore.row.order, this force 'tolerance' argument to 0")
tolerance = 0
}
# roll join to support 'tolerance' argument, conditional to retain support for factor when tolerance=0
ans = if (identical(tolerance, 0)) target[current, nomatch=NA, which=TRUE, on=jn.on] else {
ans1 = target[current, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
ans2 = target[current, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on]
pmin(ans1, ans2, na.rm=TRUE)
}
if (any_na(as_list(ans))) {
msg = c(msg, sprintf("Dataset 'current' has rows not present in 'target'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg))
return(msg)
}
ans = if (identical(tolerance, 0)) current[target, nomatch=NA, which=TRUE, on=jn.on] else {
ans1 = current[target, roll=tolerance, rollends=TRUE, which=TRUE, on=jn.on]
ans2 = current[target, roll=-tolerance, rollends=TRUE, which=TRUE, on=jn.on]
pmin(ans1, ans2, na.rm=TRUE)
}
if (any_na(as_list(ans))) {
msg = c(msg, sprintf("Dataset 'target' has rows not present in 'current'%s%s", if (target_dup || current_dup) " or present in different quantity" else "", tolerance.msg))
return(msg)
}
} else {
for (i in seq_along(target)) {
# trim.levels moved here
x = target[[i]]
y = current[[i]]
if (xor(is.factor(x),is.factor(y)))
return("Internal error: factor type mismatch should have been caught earlier")
cols.r = TRUE
if (is.factor(x)) {
if (!identical(levels(x),levels(y))) {
if (trim.levels) {
# do this regardless of check.attributes (that's more about classes, checked above)
x = factor(x)
y = factor(y)
if (!identical(levels(x),levels(y)))
cols.r = "Levels not identical even after refactoring since trim.levels is TRUE"
} else {
cols.r = "Levels not identical. No attempt to refactor because trim.levels is FALSE"
}
} else {
cols.r = all.equal(x, y, check.attributes=check.attributes)
# the check.attributes here refers to everything other than the levels, which are always
# dealt with according to trim.levels
}
} else {
cols.r = all.equal(unclass(x), unclass(y), tolerance=tolerance, ..., check.attributes=check.attributes)
# classes were explicitly checked earlier above, so ignore classes here.
}
if (!isTRUE(cols.r)) return(paste0("Column '", names(target)[i], "': ", paste(cols.r,collapse=" ")))
}
}
TRUE
}