Skip to content

Commit

Permalink
Merge pull request #84 from mayer79/rowsum_warning
Browse files Browse the repository at this point in the history
Rowsum warning
  • Loading branch information
mayer79 authored Oct 21, 2023
2 parents aba883f + 894ce95 commit 7a97c03
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 28 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
- `average_loss()` also returns a "hstats_matrix" object with `print()` and `plot()` method. The values can be extracted via `$M`.
- The default `v` of `hstats()` and `perm_importance()` is now `NULL`. Internally, it is set to `colnames(X)` (minus the column names of `w` and `y` if passed as name).
- Missing grid values: `partial_dep()` and `ice()` have received a `na.rm` argument that controls if missing values are dropped during grid creation. The default `TRUE` is compatible with earlier releases.
- Missing values in `hstats()`: Discrete variables with missings would cause `rowsum()` to launch repeated warnings. This case is now catched.
- The position of some function arguments have changed.

# hstats 0.3.0
Expand Down
10 changes: 7 additions & 3 deletions R/pd_raw.R
Original file line number Diff line number Diff line change
Expand Up @@ -115,14 +115,18 @@ ice_raw <- function(object, v, X, grid, pred_fun = stats::predict,
if (!any(X_dup)) {
return(list(X = X, w = w)) # No optimization done
}
# Compress

# Compensate via w
if (is.null(w)) {
w <- rep(1.0, times = nrow(X))
}
if (anyNA(x_not_v)) {
# rowsum() warns about NA in group = x_not_v -> integer encode
x_not_v <- match(x_not_v, x_not_v[!X_dup])
}
list(
X = X[!X_dup, , drop = FALSE],
w = c(rowsum(w, group = x_not_v, reorder = FALSE)) # warning if missing in x_not_v
w = c(rowsum(w, group = x_not_v, reorder = FALSE))
)
}

Expand Down
5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,10 @@ average_loss(fit, X = X_valid, y = y_valid)
Let's calculate different H-statistics via `hstats()`:

```r
# 4 seconds on simple laptop - a random forest will take 2-3 minutes
# With quant_approx = 25 (dense features are binned into 25 bins): 1.5 s
# 4 seconds on simple laptop - a random forest will take 2 minutes
set.seed(782)
system.time(
s <- hstats(fit, X = X_train)
s <- hstats(fit, X = X_train) #, approx = TRUE: twice as fast
)
s
# H^2 (normalized)
Expand Down
55 changes: 33 additions & 22 deletions tests/testthat/test_partial_dep.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,12 @@ test_that("pd_raw() also works for multioutput situations", {

test_that("pd_raw() works with missings (all compressions on)", {
X <- cbind(a = c(NA, NA, NA, 1, 1), b = 1:5)

out <- pd_raw(1, v = "a", X = X, pred_fun = function(m, x) x[, "b"], grid = c(NA, 1))
expect_equal(drop(out), rep(mean(X[, "b"]), times = 2L))

expect_warning(
out <- pd_raw(1, v = "b", X = X, pred_fun = function(m, x) x[, "b"], grid = 1:5)
)
expect_equal(drop(out), 1:5)
out <- pd_raw(1, v = "b", X = X, pred_fun = function(m, x) x[, "b"], grid = 5:1)
expect_equal(drop(out), 5:1)
})

# Now, partial_dep()
Expand Down Expand Up @@ -445,6 +444,36 @@ test_that(".compress_X() leaves X unchanged if not exactly 1 non-grid variable",
expect_equal(out$w, NULL)
})

test_that(".compress_X() works with missing values", {
# Note that b is not used after compression

# data.frame
X <- data.frame(a = c(NA, NA, NA, 1, 1), b = 1:5)
out_df <- data.frame(a = c(NA, 1), b = c(1, 4), row.names = c(1L, 4L))
out <- .compress_X(X, v = "b")
expect_equal(out$X, out_df)
expect_equal(out$w, c(3, 2))

# Matrix
X <- cbind(a = c(NA, NA, NA, 1, 1), b = 1:5)
out_m <- cbind(a = c(NA, 1), b = c(1, 4))
out <- .compress_X(X, v = "b")
expect_equal(out$X, out_m)
expect_equal(out$w, c(3, 2))

# Factor case
a <- factor(c(NA, NA, "B", "B", NA, "A"))
X <- data.frame(a = a, b = 1:6)
out_df <- data.frame(
a = factor(c(NA, "B", "A"), levels = levels(a)),
b = c(1, 3, 6),
row.names = c(1L, 3L, 6L)
)
out <- .compress_X(X, v = "b")
expect_equal(out$X, out_df)
expect_equal(out$w, 3:1)
})

test_that(".compress_grid() works with missing values in grid", {
g <- c(2, 2, NA, 1, NA)
gg <- .compress_grid(g)
Expand Down Expand Up @@ -494,21 +523,3 @@ test_that(".compress_grid() leaves grid unchanged if unique", {
expect_equal(out$grid, g)
expect_equal(out$reindex, NULL)
})

test_that(".compress_X() works with missing values", {
# Note that b is not used after compression

# data.frame
X <- data.frame(a = c(NA, NA, NA, 1, 1), b = 1:5)
out_df <- data.frame(a = c(NA, 1), b = c(1, 4), row.names = c(1L, 4L))
expect_warning(out <- .compress_X(X, v = "b"))
expect_equal(out$X, out_df)
expect_equal(out$w, c(3, 2))

# Matrix
X <- cbind(a = c(NA, NA, NA, 1, 1), b = 1:5)
out_m <- cbind(a = c(NA, 1), b = c(1, 4))
expect_warning(out <- .compress_X(X, v = "b"))
expect_equal(out$X, out_m)
expect_equal(out$w, c(3, 2))
})

0 comments on commit 7a97c03

Please sign in to comment.