diff --git a/NEWS.md b/NEWS.md index b0c5a53e..5b71d8b2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -22,6 +22,7 @@ - `average_loss()` also returns a "hstats_matrix" object with `print()` and `plot()` method. The values can be extracted via `$M`. - The default `v` of `hstats()` and `perm_importance()` is now `NULL`. Internally, it is set to `colnames(X)` (minus the column names of `w` and `y` if passed as name). - Missing grid values: `partial_dep()` and `ice()` have received a `na.rm` argument that controls if missing values are dropped during grid creation. The default `TRUE` is compatible with earlier releases. +- Missing values in `hstats()`: Discrete variables with missings would cause `rowsum()` to launch repeated warnings. This case is now catched. - The position of some function arguments have changed. # hstats 0.3.0 diff --git a/R/pd_raw.R b/R/pd_raw.R index 348ef86b..0f6f4d60 100644 --- a/R/pd_raw.R +++ b/R/pd_raw.R @@ -115,14 +115,18 @@ ice_raw <- function(object, v, X, grid, pred_fun = stats::predict, if (!any(X_dup)) { return(list(X = X, w = w)) # No optimization done } - - # Compress + + # Compensate via w if (is.null(w)) { w <- rep(1.0, times = nrow(X)) } + if (anyNA(x_not_v)) { + # rowsum() warns about NA in group = x_not_v -> integer encode + x_not_v <- match(x_not_v, x_not_v[!X_dup]) + } list( X = X[!X_dup, , drop = FALSE], - w = c(rowsum(w, group = x_not_v, reorder = FALSE)) # warning if missing in x_not_v + w = c(rowsum(w, group = x_not_v, reorder = FALSE)) ) } diff --git a/README.md b/README.md index 5243701f..cb4f5f5b 100644 --- a/README.md +++ b/README.md @@ -102,11 +102,10 @@ average_loss(fit, X = X_valid, y = y_valid) Let's calculate different H-statistics via `hstats()`: ```r -# 4 seconds on simple laptop - a random forest will take 2-3 minutes -# With quant_approx = 25 (dense features are binned into 25 bins): 1.5 s +# 4 seconds on simple laptop - a random forest will take 2 minutes set.seed(782) system.time( - s <- hstats(fit, X = X_train) + s <- hstats(fit, X = X_train) #, approx = TRUE: twice as fast ) s # H^2 (normalized) diff --git a/tests/testthat/test_partial_dep.R b/tests/testthat/test_partial_dep.R index 542c508c..d5e51a50 100644 --- a/tests/testthat/test_partial_dep.R +++ b/tests/testthat/test_partial_dep.R @@ -84,13 +84,12 @@ test_that("pd_raw() also works for multioutput situations", { test_that("pd_raw() works with missings (all compressions on)", { X <- cbind(a = c(NA, NA, NA, 1, 1), b = 1:5) + out <- pd_raw(1, v = "a", X = X, pred_fun = function(m, x) x[, "b"], grid = c(NA, 1)) expect_equal(drop(out), rep(mean(X[, "b"]), times = 2L)) - expect_warning( - out <- pd_raw(1, v = "b", X = X, pred_fun = function(m, x) x[, "b"], grid = 1:5) - ) - expect_equal(drop(out), 1:5) + out <- pd_raw(1, v = "b", X = X, pred_fun = function(m, x) x[, "b"], grid = 5:1) + expect_equal(drop(out), 5:1) }) # Now, partial_dep() @@ -445,6 +444,36 @@ test_that(".compress_X() leaves X unchanged if not exactly 1 non-grid variable", expect_equal(out$w, NULL) }) +test_that(".compress_X() works with missing values", { + # Note that b is not used after compression + + # data.frame + X <- data.frame(a = c(NA, NA, NA, 1, 1), b = 1:5) + out_df <- data.frame(a = c(NA, 1), b = c(1, 4), row.names = c(1L, 4L)) + out <- .compress_X(X, v = "b") + expect_equal(out$X, out_df) + expect_equal(out$w, c(3, 2)) + + # Matrix + X <- cbind(a = c(NA, NA, NA, 1, 1), b = 1:5) + out_m <- cbind(a = c(NA, 1), b = c(1, 4)) + out <- .compress_X(X, v = "b") + expect_equal(out$X, out_m) + expect_equal(out$w, c(3, 2)) + + # Factor case + a <- factor(c(NA, NA, "B", "B", NA, "A")) + X <- data.frame(a = a, b = 1:6) + out_df <- data.frame( + a = factor(c(NA, "B", "A"), levels = levels(a)), + b = c(1, 3, 6), + row.names = c(1L, 3L, 6L) + ) + out <- .compress_X(X, v = "b") + expect_equal(out$X, out_df) + expect_equal(out$w, 3:1) +}) + test_that(".compress_grid() works with missing values in grid", { g <- c(2, 2, NA, 1, NA) gg <- .compress_grid(g) @@ -494,21 +523,3 @@ test_that(".compress_grid() leaves grid unchanged if unique", { expect_equal(out$grid, g) expect_equal(out$reindex, NULL) }) - -test_that(".compress_X() works with missing values", { - # Note that b is not used after compression - - # data.frame - X <- data.frame(a = c(NA, NA, NA, 1, 1), b = 1:5) - out_df <- data.frame(a = c(NA, 1), b = c(1, 4), row.names = c(1L, 4L)) - expect_warning(out <- .compress_X(X, v = "b")) - expect_equal(out$X, out_df) - expect_equal(out$w, c(3, 2)) - - # Matrix - X <- cbind(a = c(NA, NA, NA, 1, 1), b = 1:5) - out_m <- cbind(a = c(NA, 1), b = c(1, 4)) - expect_warning(out <- .compress_X(X, v = "b")) - expect_equal(out$X, out_m) - expect_equal(out$w, c(3, 2)) -})