Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved factor handling #128

Merged
merged 4 commits into from
May 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
* :fire: hot fixes for scenario scaling and normalization issue #113
* :bug: fix so that projection works with different extents than used for inference.
* :bug: fix that prevented `BART` models to be saved/loaded from disk #127.
* :bug: fixes related to `factor` handling for all engines.

# ibis.iSDM 0.1.2

Expand Down
2 changes: 2 additions & 0 deletions R/add_predictors.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ NULL
#' Certain names such \code{"offset"} are forbidden as predictor variable names.
#' The function will return an error message if these are used.
#'
#' Some engines use binary variables regardless of the parameter \code{explode_factors}
#' set here.
#'
#' @examples
#' \dontrun{
Expand Down
43 changes: 28 additions & 15 deletions R/engine_bart.R
Original file line number Diff line number Diff line change
Expand Up @@ -250,31 +250,44 @@ engine_bart <- function(x,
# Check if there any factors, if yes split up
if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){
vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")]
# Get factors
z <- explode_factor(train_cov[[vf]], name = vf)
# Remove variables from train_cov and append
train_cov[[vf]] <- NULL
train_cov <- cbind(train_cov, z)
# explode all present factors
for (i in 1:length(vf)) {
z <- explode_factor(train_cov[[vf[i]]], name = vf[i])
# Remove variables from train_cov
train_cov[[vf[i]]] <- NULL
train_cov <- cbind(train_cov, z)
# append to types data.frame
model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types,
data.frame(predictors = colnames(z), type = "numeric"))
# Also update the formula
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation,
paste0(". ~ . -", vf[i]))
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation,
paste0(". ~ . +", paste0(colnames(z), collapse = "+")))
}
# update containers
model$biodiversity[[1]]$predictors <- train_cov # Save new in model object
model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))

# Also update the formula
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . -", vf))
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . +", paste0(colnames(z),collapse = "+")))
model$biodiversity[[1]]$predictors_types <- dplyr::filter(model$biodiversity[[1]]$predictors_types,
!predictors %in% vf)
}

# Prediction container
pred_cov <- model$predictors[,c('x','y',model$biodiversity[[1]]$predictors_names)]
if(any(model$predictors_types$type=='factor')){
vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")]
# Get factors
z <- explode_factor(pred_cov[[vf]], name = vf)
# Remove variables from train_cov and append
pred_cov[[vf]] <- NULL
pred_cov <- cbind(pred_cov, z)
for (i in 1:length(vf)) {
z <- explode_factor(pred_cov[[vf[i]]], name = vf[i])
# Remove variables from train_cov and append
pred_cov[[vf[i]]] <- NULL
pred_cov <- cbind(pred_cov, z)
model$predictors_types <- rbind(model$predictors_types,
data.frame(predictors = colnames(z), type = "numeric"))
}

pred_cov <- pred_cov[,c("x", "y", colnames(train_cov))]
model$predictors <- pred_cov # Save new in model object
model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))

model$biodiversity[[1]]$predictors_names <- colnames(train_cov)
model$predictors_names <- colnames(pred_cov)
assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) ))
Expand Down
49 changes: 31 additions & 18 deletions R/engine_breg.R
Original file line number Diff line number Diff line change
Expand Up @@ -255,31 +255,44 @@ engine_breg <- function(x,
# Check if there any factors, if yes split up
if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){
vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")]
# Get factors
z <- explode_factor(train_cov[[vf]], name = vf)
# Remove variables from train_cov and append
train_cov[[vf]] <- NULL
train_cov <- cbind(train_cov, z)
# explode all present factors
for (i in 1:length(vf)) {
z <- explode_factor(train_cov[[vf[i]]], name = vf[i])
# Remove variables from train_cov
train_cov[[vf[i]]] <- NULL
train_cov <- cbind(train_cov, z)
# append to types data.frame
model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types,
data.frame(predictors = colnames(z), type = "numeric"))
# Also update the formula
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation,
paste0(". ~ . -", vf[i]))
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation,
paste0(". ~ . +", paste0(colnames(z), collapse = "+")))
}
# update containers
model$biodiversity[[1]]$predictors <- train_cov # Save new in model object
model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))

# Also update the formula
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . -", vf))
model$biodiversity[[1]]$equation <- stats::update.formula(model$biodiversity[[1]]$equation, paste0(". ~ . +", paste0(colnames(z),collapse = "+")))
model$biodiversity[[1]]$predictors_types <- dplyr::filter(model$biodiversity[[1]]$predictors_types,
!predictors %in% vf)
}

# Prediction container
pred_cov <- model$predictors[,model$biodiversity[[1]]$predictors_names]
pred_cov <- model$predictors[,c('x','y',model$biodiversity[[1]]$predictors_names)]
if(any(model$predictors_types$type=='factor')){
vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")]
# Get factors
z <- explode_factor(pred_cov[[vf]], name = vf)
# Remove variables from train_cov and append
pred_cov[[vf]] <- NULL
pred_cov <- cbind(pred_cov, z)
pred_cov <- pred_cov[,colnames(train_cov)]
model$predictors <- pred_cov# Save new in model object
model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))
for (i in 1:length(vf)) {
z <- explode_factor(pred_cov[[vf[i]]], name = vf[i])
# Remove variables from train_cov and append
pred_cov[[vf[i]]] <- NULL
pred_cov <- cbind(pred_cov, z)
model$predictors_types <- rbind(model$predictors_types,
data.frame(predictors = colnames(z), type = "numeric"))
}

pred_cov <- pred_cov[,c("x", "y", colnames(train_cov))]
model$predictors <- pred_cov # Save new in model object

model$biodiversity[[1]]$predictors_names <- colnames(train_cov)
model$predictors_names <- colnames(pred_cov)
assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) ))
Expand Down
3 changes: 2 additions & 1 deletion R/engine_gdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,8 @@ engine_gdb <- function(x,

# Check that factors have been correctly set if any
if(any(model$predictors_types$type=="factor")){
df[,model$predictors_types$predictors[model$predictors_types$type=="factor"]] <- factor(df[,model$predictors_types$predictors[model$predictors_types$type=="factor"]])
df[,model$predictors_types$predictors[model$predictors_types$type=="factor"]] <-
lapply(df[,model$predictors_types$predictors[model$predictors_types$type=="factor"]], as.factor)
}

# Overwrite observation data
Expand Down
3 changes: 1 addition & 2 deletions R/engine_glm.R
Original file line number Diff line number Diff line change
Expand Up @@ -296,8 +296,7 @@ engine_glm <- function(x,

# Subset the predictor types to only those present
te <- formula_terms(form)
model$biodiversity[[1]]$predictors_types <-
model$biodiversity[[1]]$predictors_types |> dplyr::filter(predictors %in% te)
model$biodiversity[[1]]$predictors_types <- dplyr::filter(model$biodiversity[[1]]$predictors_types, predictors %in% te)
model$biodiversity[[1]]$predictors_names <- intersect(model$biodiversity[[1]]$predictors_names, te)

# Get offset and add it to exposure
Expand Down
18 changes: 14 additions & 4 deletions R/engine_glmnet.R
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,7 @@ engine_glmnet <- function(x,

# Subset the predictor types to only those present
te <- formula_terms(form)
model$biodiversity[[1]]$predictors_types <-
model$biodiversity[[1]]$predictors_types |> dplyr::filter(predictors %in% te)
model$biodiversity[[1]]$predictors_types <- dplyr::filter(model$biodiversity[[1]]$predictors_types, predictors %in% te)
model$biodiversity[[1]]$predictors_names <- intersect(model$biodiversity[[1]]$predictors_names, te)

# Get offset and add it to exposure
Expand All @@ -353,15 +352,26 @@ engine_glmnet <- function(x,
if(any(model$predictors_types$type=="factor")){
fac <- model$biodiversity[[1]]$predictors_names[which(model$biodiversity[[1]]$predictors_types$type=="factor")]
# return penalty factor for each level of each factor (even if level values are identical across factors)
p.fac <- c(p.fac, rep(1, sum(apply(df[, fac, drop = FALSE], 2, function(x) length(unique(x))))))
p.fac.fac <- rep(1, sum(apply(df[, fac, drop = FALSE], 2, function(x) length(unique(x)))))
# use name and levels as name
names(p.fac.fac) <- c(sapply(fac, function(x) paste(x, levels(model$predictors[, x]), sep = "."),
USE.NAMES = FALSE))
p.fac <- c(p.fac, p.fac.fac)
}
# Duplicate p.fac container for lower and upper limits
lowlim <- rep(-Inf, length(p.fac)) |> stats::setNames(names(p.fac))
upplim <- rep(Inf, length(p.fac)) |> stats::setNames(names(p.fac))

# Trick for creation for some default lambda values for the regularization multiplier
if(is.null(params$lambda)){
reg <- default.regularization(p = df$observed, m = stats::model.matrix(form, df)) * c(1, p.fac) # add 1 for the intercept
# helper fun for naming
cc <- function(x) {colnames(x) <- paste0(".", colnames(x)); x}
# helper fun for contrasts
cont_arg <- lapply(df[, sapply(df, is.factor), drop = FALSE], function(x) cc(stats::contrasts(x, contrasts = FALSE)))
m_mat <- stats::model.matrix(form, df, contrasts.arg = cont_arg)
reg <- default.regularization(p = df$observed, m = m_mat)
# make sure same order
reg <- reg[sort(names(reg))] * c("(Intercept)" = 1, p.fac[sort(names(p.fac))]) # add 1 for the intercept
params$lambda <- 10^(seq(4, 0, length.out = 200)) * sum(p.fac)/length(p.fac) * sum(p.fac)/sum(w)
if(anyNA(params$lambda)) params$lambda <- NULL
}
Expand Down
37 changes: 25 additions & 12 deletions R/engine_xgboost.R
Original file line number Diff line number Diff line change
Expand Up @@ -302,13 +302,20 @@ engine_xgboost <- function(x,
# Check if there any factors, if yes split up
if(any(model$biodiversity[[1]]$predictors_types$type=='factor')){
vf <- model$biodiversity[[1]]$predictors_types$predictors[which(model$biodiversity[[1]]$predictors_types$type == "factor")]
# Get factors
z <- explode_factor(train_cov[[vf]], name = vf)
# Remove variables from train_cov and append
train_cov[[vf]] <- NULL
train_cov <- cbind(train_cov, z)
# explode all present factors
for (i in 1:length(vf)) {
z <- explode_factor(train_cov[[vf[i]]], name = vf[i])
# Remove variables from train_cov
train_cov[[vf[i]]] <- NULL
train_cov <- cbind(train_cov, z)
# append to types data.frame
model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types,
data.frame(predictors = colnames(z), type = "numeric"))
}
# update containers
model$biodiversity[[1]]$predictors <- train_cov # Save new in model object
model$biodiversity[[1]]$predictors_types <- rbind(model$biodiversity[[1]]$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))
model$biodiversity[[1]]$predictors_types <- dplyr::filter(model$biodiversity[[1]]$predictors_types,
!predictors %in% vf)
}
train_cov <- as.matrix( train_cov )
labels <- model$biodiversity[[1]]$observations$observed
Expand Down Expand Up @@ -337,14 +344,20 @@ engine_xgboost <- function(x,
if(any(model$predictors_types$type=='factor')){
vf <- model$predictors_types$predictors[which(model$predictors_types$type == "factor")]
# Get factors
z <- explode_factor(pred_cov[[vf]], name = vf)
# Remove variables from train_cov and append
pred_cov[[vf]] <- NULL
pred_cov <- cbind(pred_cov, z)
for (i in 1:length(vf)) {
z <- explode_factor(pred_cov[[vf[i]]], name = vf[i])
# Remove variables from train_cov and append
pred_cov[[vf[i]]] <- NULL
pred_cov <- cbind(pred_cov, z)
model$predictors_types <- rbind(model$predictors_types,
data.frame(predictors = colnames(z), type = "numeric"))
}

model$predictors <- pred_cov # Save new in model object
model$predictors_types <- rbind(model$predictors_types, data.frame(predictors = colnames(z), type = "numeric"))
model$biodiversity[[1]]$predictors_names <- colnames(pred_cov)

model$biodiversity[[1]]$predictors_names <- colnames(train_cov)
model$predictors_names <- colnames(pred_cov)
assertthat::assert_that(all( colnames(train_cov) %in% colnames(pred_cov) ))
}
pred_cov <- as.matrix( pred_cov )
# Ensure that the column names are identical for both
Expand Down
Loading