diff --git a/R/build_projection_model_opt_plot.r b/R/build_projection_model_opt_plot.r
deleted file mode 100644
index 2fe3521..0000000
--- a/R/build_projection_model_opt_plot.r
+++ /dev/null
@@ -1,60 +0,0 @@
-#' Model future uptake from current uptake
-#'
-#' BEWARE: THIS DOES NOT YET ACCOUNT FOR MULTI-YEAR OR MULTI-STATE DATA SETS
-#'
-#' Future incident uptake can be predicted from current incident uptake.
-#' A model can be built and fit to project incident uptake into the future.
-#'
-#' The default model says that current incident uptake is a linear function
-#' of the previous incident uptake, time elapsed since rollout, and their
-#' interaction.
-#'
-#' All modeling is performed on the daily-average incident uptake to
-#' remove accumulation dependence among data points and to
-#' account for slight variations in incident reporting intervals.
-#' The first data point may be dropped before fitting the model
-#' if there is a long lag between the vaccine rollout date and
-#' the date that data collection began.
-#' All variables are standardized prior to fitting the model.
-#'
-#' UPDATES FOR THE FUTURE:
-#' - Model must incorporate year-to-year variability
-#' - Model must incorporate state-to-state variability
-#' - A variety of models should be tried and competed, or a formula provided
-#'
-#' @description Fit a model to project uptake into the future
-#'
-#' @param data uptake data to fit the projection model
-#' @param drop_first whether to drop the first data point
-#' @param output whether to print the model summary and diagnostic plots
-#'
-#' @return a brms model object
-#'
-#' @export
-#'
-build_projection_model2 <- function(data, drop_first = NULL, output = FALSE) {
-  if (is.null(drop_first)) {
-    intervals <- diff(data$elapsed)
-    if (abs(data$elapsed[1] - mean(intervals)) > (sd(intervals) + 1)) {
-      drop_first <- TRUE
-    } else {
-      drop_first <- FALSE
-      intervals <- c(data$elapsed[1], intervals)
-    }
-  }
-
-  if (drop_first) {
-    data <- data[-1, ]
-  }
-
-  model <- brms::brm(daily_std ~ previous_std * elapsed_std,
-    data = data,
-    family = gaussian()
-  )
-
-  if (output) {
-    print(summary(model))
-    plot(model)
-  }
-
-  return(model)
-}
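For orientation, a minimal usage sketch with invented toy data: the function only requires the three standardized columns named in its formula (the real pipeline builds them via data_prepare() in R/generate_projections.R).

toy <- data.frame(elapsed = c(7, 14, 21, 28, 35, 42))      # toy reporting days
toy$daily <- exp(-toy$elapsed / 20)                        # a declining daily uptake rate
toy$previous <- c(NA, toy$daily[-nrow(toy)])               # lag-1 predictor
toy <- toy[-1, ]
toy$daily_std <- as.numeric(scale(toy$daily))
toy$previous_std <- as.numeric(scale(toy$previous))
toy$elapsed_std <- as.numeric(scale(toy$elapsed))
fit <- build_projection_model2(toy, output = TRUE)         # prints summary and plots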
diff --git a/R/build_scaling_model.r b/R/build_scaling_model.r
deleted file mode 100644
index 8e0f1f4..0000000
--- a/R/build_scaling_model.r
+++ /dev/null
@@ -1,89 +0,0 @@
-#' Model one source of uptake data from another
-#'
-#' Different sources of uptake data may systematically disagree.
-#' A model can be built and fit to scale one source to the other.
-#'
-#' The default model says that the uptake data from the target source
-#' is a linear function of the input data, time elapsed since rollout,
-#' and their interaction. If multiple states are represented, each state
-#' is modeled separately via group-level variation of the intercept and
-#' the input data coefficients.
-#'
-#' All modeling is performed on the daily-average incident uptake to
-#' remove accumulation dependence among data points and to
-#' account for slight variations in incident reporting intervals.
-#' The first data point may be dropped before fitting the model
-#' if there is a long lag between the vaccine rollout date and
-#' the date that data collection began.
-#' All variables are standardized prior to fitting the model.
-#'
-#' UPDATES FOR THE FUTURE:
-#' - Model must incorporate year-to-year variability
-#' - A variety of models should be tried and competed, or a formula provided
-#'
-#' @description Fit a model to scale uptake data from one source to another
-#'
-#' @param input uptake data to use as a predictor
-#' @param target uptake data to use as the outcome
-#' @param drop_first whether to drop the first data point
-#'
-#' @return a brms model object
-#'
-#' @export
-#'
-build_scaling_model <- function(input, target, drop_first = NULL) {
-
-  if (is.null(drop_first)) {
-    input_intervals <- c(input$elapsed[1], diff(input$elapsed))
-    target_intervals <- c(target$elapsed[1], diff(target$elapsed))
-
-    common_dates <- intersect(input$elapsed, target$elapsed)
-
-    input_intervals <- input_intervals[input$elapsed %in% common_dates]
-    target_intervals <- target_intervals[target$elapsed %in% common_dates]
-
-    if ((input_intervals[1] - mean(input_intervals[-1])) >
-          (sd(input_intervals[-1]) + 1) ||
-          (target_intervals[1] - mean(target_intervals[-1])) >
-          (sd(target_intervals[-1]) + 1)) {
-      drop_first <- TRUE
-    } else {
-      drop_first <- FALSE
-    }
-  }
-
-  input <- dplyr::semi_join(input, target, by = c("state", "date"))
-  target <- dplyr::semi_join(target, input, by = c("state", "date"))
-
-  if (drop_first) {
-    input <- input[-1, ]
-    target <- target[-1, ]
-  }
-
-  if (nrow(input) != nrow(target)) {
-    stop("There are state or date mismatches between the input and target data")
-  }
-
-  input$daily_std <- (input$daily - mean(input$daily)) /
-    sd(input$daily)
-  input$elapsed_std <- (input$elapsed - mean(input$elapsed)) /
-    sd(input$elapsed)
-  input$target_daily_std <- (target$daily - mean(target$daily)) /
-    sd(target$daily)
-
-  if (length(unique(input$state)) == 1) {
-    model <- brms::brm(target_daily_std ~ daily_std * elapsed_std,
-      data = input,
-      family = gaussian())
-  } else {
-    model <- brms::brm(target_daily_std ~ daily_std * elapsed_std +
-        (1 + daily_std | state),
-      data = input,
-      family = gaussian())
-  }
-
-  print(summary(model))
-  plot(model)
-
-  return(model)
-}
\ No newline at end of file
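The intended call pattern, as used in scripts/projection_script.r: align the input source to the target's reporting dates first, then fit the scaling model on the matched series.

iis_usa_2022_interp <- interpolate_dates(iis_usa_2022, nis_usa_2022)
iis_to_nis_model <- build_scaling_model(iis_usa_2022_interp, nis_usa_2022)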
diff --git a/R/evaluate_projection.r b/R/evaluate_projection.r
deleted file mode 100644
index 796c52f..0000000
--- a/R/evaluate_projection.r
+++ /dev/null
@@ -1,215 +0,0 @@
-#' Evaluate a projection model
-#'
-#' To evaluate how successfully a projection model predicts future uptake,
-#' plots of predicted incident and cumulative uptake are generated, along with
-#' the data against which the model is being compared. The model ought to
-#' match the data closely, although uncertainty may propagate through time.
-#' Each plot shows the data and model predictions with credible intervals
-#' (95% by default).
-#'
-#' If only old data, i.e. the data on which the model was fit, is provided,
-#' the projections will be compared against that. If new data, i.e. data
-#' which the model has not seen, is also provided, projections will be compared
-#' against that instead. In the latter case, the fit may not be as good, and
-#' credible intervals will be wider due to extra uncertainty over the values
-#' of year-specific parameters.
-#'
-#' If no start date is provided, it is assumed the exact incident and
-#' cumulative uptake is known for each time t-1 when incident uptake for
-#' time t is being predicted. This evaluates how well the model fits
-#' each individual data point through the entire time series.
-#'
-#' If a start date is provided, it is assumed that the exact incident and
-#' cumulative uptake is known only until that date. After that date,
-#' uncertainty propagates through time, as if actually projecting the future.
-#'
-#' If an end date is not provided, projections will be provided through
-#' the end date of the provided data set.
-#'
-#' If an end date is provided, projections will be provided through the
-#' closest expected reporting date to the end date, even if this extrapolates
-#' beyond the end of the provided data set.
-#'
-#' UPDATES FOR THE FUTURE:
-#' - Model must incorporate year-to-year variability
-#' - Model must incorporate state-to-state variability
-#' - Multiple models can be provided, with scaling, to pool projections
-#'
-#' @description Evaluate a model that projects uptake into the future
-#'
-#' @param old_data uptake data used to fit the projection model
-#' @param model model fit to project uptake into the future
-#' @param new_data uptake data to compare the projection model against
-#' @param start_date which date to perform projection from, in %Y-%m-%d format
-#' @param end_date which date to perform projections until, in %Y-%m-%d format
-#' @param conf width of credible interval to plot
-#'
-#' @return data frame of the data underlying diagnostic plots
-#'
-#' @export
-#'
-evaluate_projection <- function(old_data, model, new_data = NULL,
-                                start_date = NULL, end_date = NULL,
-                                conf = 0.95) {
-
-  model_rows <- (nrow(old_data) - nrow(model$data) + 1):(nrow(old_data))
-
-  daily_mean <- mean(old_data$daily[model_rows])
-  daily_sd <- sd(old_data$daily[model_rows])
-  previous_mean <- mean(old_data$daily[model_rows - 1])
-  previous_sd <- sd(old_data$daily[model_rows - 1])
-  elapsed_mean <- mean(old_data$elapsed[model_rows])
-  elapsed_sd <- sd(old_data$elapsed[model_rows])
-
-  if (is.null(new_data)) {
-    output <- old_data
-  } else {
-    output <- new_data
-  }
-
-  output$previous <- c(NA, output$daily[1:(nrow(output) - 1)])
-  output$interval <- c(output$elapsed[1], diff(output$elapsed))
-
-  if (!is.null(end_date)) {
-    end_date <- as.Date(end_date)
-    if (end_date <= output$date[nrow(output)]) {
-      output <- output[output$date <= end_date, ]
-    } else {
-      extra_dates <- seq(output$date[nrow(output)], end_date,
-                         by = round(mean(output$interval)))[-1]
-      extra_rows <- nrow(output) + seq_along(extra_dates)
-      output[extra_rows, ] <- NA
-      output$date[extra_rows] <- extra_dates
-      output$elapsed <- as.numeric(output$date -
-                                     (output$date[1] - output$elapsed[1]))
-      output$interval <- c(output$elapsed[1], diff(output$elapsed))
-      output$previous[extra_rows[1]] <- output$daily[extra_rows[1] - 1]
-    }
-  }
-
-  output$incident_mean <- rep(NA, nrow(output))
-  output$incident_upper <- rep(NA, nrow(output))
-  output$incident_lower <- rep(NA, nrow(output))
-  output$cumulative_mean <- rep(NA, nrow(output))
-  output$cumulative_upper <- rep(NA, nrow(output))
-  output$cumulative_lower <- rep(NA, nrow(output))
-
-  if (!is.null(start_date)) {
-    start_date <- as.Date(start_date)
-    early_dates <- which(output$date <= start_date | is.na(output$previous))
-  } else {
-    early_dates <- which(is.na(output$previous))
-  }
-  output$incident_mean[early_dates] <- output$incident[early_dates]
-  output$incident_upper[early_dates] <- output$incident[early_dates]
-  output$incident_lower[early_dates] <- output$incident[early_dates]
-  output$cumulative_mean[early_dates] <- output$cumulative[early_dates]
-  output$cumulative_upper[early_dates] <- output$cumulative[early_dates]
-  output$cumulative_lower[early_dates] <- output$cumulative[early_dates]
-
-  output_copy <- output[is.na(output$incident_mean), ]
-
-  if (is.null(start_date)) {
-    input <- data.frame(elapsed_std = (output_copy$elapsed - elapsed_mean) /
-                          elapsed_sd,
-                        previous_std = (output_copy$previous - previous_mean) /
-                          previous_sd)
-
-    post_check <- brms::posterior_predict(model, newdata = input) *
-      daily_sd + daily_mean
-  } else {
-    post_check <- matrix(NA, nrow = brms::ndraws(model),
-                         ncol = nrow(output_copy))
-
-    for (i in seq_len(nrow(output_copy))) {
-      if (i == 1) {
-        input <- data.frame(elapsed_std = (output_copy$elapsed[i] -
-                                             elapsed_mean) / elapsed_sd,
-                            previous_std = (output_copy$previous[i] -
-                                              previous_mean) / previous_sd)
-        post_check[, i] <- as.vector(brms::posterior_predict(model,
-                                                             newdata = input))
-      } else {
-        input <- data.frame(elapsed_std = (output_copy$elapsed[i] -
-                                             elapsed_mean) / elapsed_sd,
-                            previous_std = (post_check[, i - 1] -
-                                              previous_mean) / previous_sd)
-        post_check[, i] <- as.vector(brms::posterior_predict(model,
                                                             newdata = input,
-                                                             ndraws = 1))
-      }
-      post_check[, i] <- post_check[, i] * daily_sd + daily_mean
-    }
-  }
-
-  post_check <- sweep(post_check, 2, output_copy$interval, FUN = `*`)
-
-  output_copy$incident_mean <- colMeans(post_check)
-  output_copy$incident_upper <- apply(post_check, 2, quantile,
-                                      1 - (1 - conf) / 2)
-  output_copy$incident_lower <- apply(post_check, 2, quantile,
-                                      (1 - conf) / 2)
-
-  if (is.null(start_date)) {
-    post_check <- sweep(post_check, 2,
-                        (output_copy$cumulative - output_copy$incident),
-                        FUN = `+`)
-  } else {
-    post_check <- t(apply(post_check, 1, cumsum))
-    post_check <- sweep(post_check, 2, (output_copy$cumulative[1] -
-                                          output_copy$incident[1]),
-                        FUN = `+`)
-  }
-
-  output_copy$cumulative_mean <- colMeans(post_check)
-  output_copy$cumulative_upper <- apply(post_check, 2, quantile,
-                                        1 - (1 - conf) / 2)
-  output_copy$cumulative_lower <- apply(post_check, 2, quantile,
-                                        (1 - conf) / 2)
-
-  output <- rbind(output[!is.na(output$incident_mean), ], output_copy)
-
-  output$interval <- NULL
-  output$previous <- NULL
-
-  incident_plot <- ggplot2::ggplot(output) +
-    ggplot2::geom_ribbon(ggplot2::aes(x = date, ymax = incident_upper,
-                                      ymin = incident_lower),
-                         fill = "black", alpha = 0.25) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = incident),
-                       color = "dodgerblue", linewidth = 1.5) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = incident_mean),
-                       color = "black", linewidth = 1.5) +
-    ggplot2::theme_bw() +
-    ggplot2::xlab("Time") + ggplot2::ylab("% of Population") +
-    ggplot2::theme(text = ggplot2::element_text(size = 20)) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.9 * diff(range(output$incident_upper)),
-                      label = "Input", col = "dodgerblue", size = 10) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.75 * diff(range(output$incident_upper)),
-                      label = "Model", col = "black", size = 10)
-  print(incident_plot)
-
-  cumulative_plot <- ggplot2::ggplot(output) +
-    ggplot2::geom_ribbon(ggplot2::aes(x = date, ymax = cumulative_upper,
-                                      ymin = cumulative_lower),
-                         fill = "black", alpha = 0.25) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = cumulative),
-                       color = "dodgerblue", linewidth = 1.5) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = cumulative_mean),
-                       color = "black", linewidth = 1.5) +
-    ggplot2::theme_bw() +
-    ggplot2::xlab("Time") + ggplot2::ylab("% of Population") +
-    ggplot2::theme(text = ggplot2::element_text(size = 20)) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.25 * diff(range(output$cumulative_upper)),
-                      label = "Input", col = "dodgerblue", size = 10) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.1 * diff(range(output$cumulative_upper)),
-                      label = "Model", col = "black", size = 10)
-  print(cumulative_plot)
-
-  return(output)
-}
\ No newline at end of file
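The evaluation modes described above map onto distinct call signatures, taken from scripts/projection_script.r (the start date here is one of the 2022 reporting dates):

evaluate_projection(nis_usa_2022, nis_forward_model)                # one-step-ahead fit
evaluate_projection(nis_usa_2022, nis_forward_model,
                    start_date = "2022-09-10")                      # true forward projection
evaluate_projection(nis_usa_2022, nis_forward_model, nis_usa_2023)  # against unseen data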
diff --git a/R/evaluate_scaling.r b/R/evaluate_scaling.r
deleted file mode 100644
index d29b003..0000000
--- a/R/evaluate_scaling.r
+++ /dev/null
@@ -1,107 +0,0 @@
-#' Evaluate a scaling model
-#'
-#' To evaluate how successfully a scaling model scales one uptake data set
-#' onto another, plots of the incident and cumulative uptake are generated.
-#' Each plot shows the input data, the target data, and the model predictions
-#' and credible interval (95% by default).
-#'
-#' UPDATES FOR THE FUTURE:
-#' - Model must incorporate year-to-year variability
-#'
-#' @description Evaluate a model that scales one uptake data set to another
-#'
-#' @param input uptake data to use as a predictor
-#' @param target uptake data to use as the outcome
-#' @param model model fit to scale input to target
-#' @param conf width of credible interval to plot
-#'
-#' @return data frame of the data underlying diagnostic plots
-#'
-#' @export
-#'
-evaluate_scaling <- function(input, target, model, conf = 0.95) {
-
-  output <- dplyr::inner_join(input, target,
-                              by = c("state", "date", "elapsed", "year"))
-  output <- output[order(output$date), ]
-  colnames(output) <- gsub("\\.x", "_input", colnames(output))
-  colnames(output) <- gsub("\\.y", "_target", colnames(output))
-
-  if (nrow(output) == (stats::nobs(model) + 1)) {
-    intervals <- diff(output$elapsed)
-    output <- output[-1, ]
-  } else {
-    intervals <- c(output$elapsed[1], diff(output$elapsed))
-  }
-
-  known <- output$cumulative_target - output$incident_target
-
-  post_check <- brms::posterior_predict(model) *
-    sd(output$daily_target) + mean(output$daily_target)
-
-  post_check <- sweep(post_check, 2, intervals, FUN = `*`)
-
-  output$incident_pred_mean <- colMeans(post_check)
-  output$incident_pred_upper <- apply(post_check, 2, quantile,
-                                      1 - (1 - conf) / 2)
-  output$incident_pred_lower <- apply(post_check, 2, quantile,
-                                      (1 - conf) / 2)
-
-  post_check <- sweep(post_check, 2, known, FUN = `+`)
-
-  output$cumulative_pred_mean <- colMeans(post_check)
-  output$cumulative_pred_upper <- apply(post_check, 2, quantile,
-                                        1 - (1 - conf) / 2)
-  output$cumulative_pred_lower <- apply(post_check, 2, quantile,
-                                        (1 - conf) / 2)
-
-  incident_plot <- ggplot2::ggplot(output) +
-    ggplot2::geom_ribbon(ggplot2::aes(x = date, ymax = incident_pred_upper,
-                                      ymin = incident_pred_lower),
-                         fill = "black", alpha = 0.25) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = incident_input),
-                       color = "dodgerblue", linewidth = 1.5) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = incident_target),
-                       color = "darkred", linewidth = 1.5) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = incident_pred_mean),
-                       color = "black", linewidth = 1.5) +
-    ggplot2::theme_bw() +
-    ggplot2::xlab("Time") + ggplot2::ylab("% of Population") +
-    ggplot2::theme(text = ggplot2::element_text(size = 20)) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.9 * diff(range(output$incident_pred_upper)),
-                      label = "Input", col = "dodgerblue", size = 10) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.75 * diff(range(output$incident_pred_upper)),
-                      label = "Target", col = "darkred", size = 10) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.6 * diff(range(output$incident_pred_upper)),
-                      label = "Model", col = "black", size = 10)
-  print(incident_plot)
-
-  cumulative_plot <- ggplot2::ggplot(output) +
-    ggplot2::geom_ribbon(ggplot2::aes(x = date, ymax = cumulative_pred_upper,
-                                      ymin = cumulative_pred_lower),
-                         fill = "black", alpha = 0.25) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = cumulative_input),
-                       color = "dodgerblue", linewidth = 1.5) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = cumulative_target),
-                       color = "darkred", linewidth = 1.5) +
-    ggplot2::geom_line(ggplot2::aes(x = date, y = cumulative_pred_mean),
-                       color = "black", linewidth = 1.5) +
-    ggplot2::theme_bw() +
-    ggplot2::xlab("Time") + ggplot2::ylab("% of Population") +
-    ggplot2::theme(text = ggplot2::element_text(size = 20)) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.4 * diff(range(output$cumulative_pred_upper)),
-                      label = "Input", col = "dodgerblue", size = 10) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.25 * diff(range(output$cumulative_pred_upper)),
-                      label = "Target", col = "darkred", size = 10) +
-    ggplot2::annotate("text", x = output$date[round(0.9 * nrow(output))],
-                      y = 0.1 * diff(range(output$cumulative_pred_upper)),
-                      label = "Model", col = "black", size = 10)
-  print(cumulative_plot)
-
-  return(output)
-}
\ No newline at end of file
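The credible bands in both evaluation functions come from the same two-step pattern: rescale posterior draws of the daily rate to incident uptake with sweep(), then take column quantiles. In miniature, with made-up draws and intervals:

set.seed(1)
post <- matrix(rnorm(4000, mean = 0.5, sd = 0.1), nrow = 1000)  # draws x dates
intervals <- c(14, 7, 7, 7)                   # days between reports
post <- sweep(post, 2, intervals, FUN = `*`)  # daily rate -> incident uptake
upper <- apply(post, 2, quantile, 0.975)      # 95% band, i.e. conf = 0.95
lower <- apply(post, 2, quantile, 0.025)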
diff --git a/R/generate_projections.R b/R/generate_projections.R
deleted file mode 100644
index b84cb27..0000000
--- a/R/generate_projections.R
+++ /dev/null
@@ -1,207 +0,0 @@
-library(dplyr)
-library(ggplot2)
-
-### function nesting, from innermost to outermost:
-# inv_z_scale()
-# data_prepare()
-# build_projection_model2()
-#
-# ARE USED IN
-#
-# real_time_projection() -> plot_projection()
-#
-# WHICH IS USED IN
-#
-# plot_real_time_projection()
-
-# real_time_projection() -> get_mspe()
-#
-# WHICH ARE USED IN
-#
-# evaluate_real_time_projection()
-
-
-#### inverse Z score #####
-#' @description Inversely transform z-score standardized data to raw data
-#' using: raw = z * sd + mean
-#'
-#' @param old_data the original data that was used for z-score standardization
-#' @param scaled_data the currently scaled data
-#' @return the inverse z-score transformed data
-inv_z_scale <- function(old_data, scaled_data) {
-  sd <- sd(old_data)
-  m <- mean(old_data)
-
-  scaled_data * sd + m
-}
-
-
-#' @description Prepare the data format for model building.
-#' It includes:
-#' 1. Adding the `previous` predictor.
-#' 2. Z-score standardization of `previous`, `elapsed`, and `daily`
-#' @param data the data frame to be prepared.
-#' Must include the columns `elapsed` and `daily`
-#' @return a data frame with scaled `previous`, `elapsed`, and `daily`, ready for model building
-data_prepare <- function(data) {
-  data$previous <- c(NA, data$daily[-nrow(data)]) # nolint
-  data <- data[-1, ]
-  data$previous_std <- scale(data$previous, center = TRUE, scale = TRUE)
-  data$elapsed_std <- scale(data$elapsed, center = TRUE, scale = TRUE)
-  data$daily_std <- scale(data$daily, center = TRUE, scale = TRUE)
-
-  return(data)
-}
-
-#' @description This function uses `train_data` to train the linear regression
-#' and projects to the last date in `test_data`.
-#' It is embedded in `plot_real_time_projection` and `evaluate_real_time_projection`
-#' @param train_data data frame used to train the linear regression
-#' @param test_data data frame compared against the predictions from the linear regression
-#' @return a list containing the predicted uptake and the corresponding
-#' observed values over the `test_data` time frame,
-#' as both uptake rate `rate` and cumulative uptake `cumulative`
-
-real_time_projection <- function(
-    train_data = nis_usa_2022,
-    test_data = nis_usa_2023) {
-  # transform the train and test data into the format for model building and prediction #
-  scaled_train <- data_prepare(train_data)
-  scaled_test <- data_prepare(test_data)
-
-  # The first row is also removed from the raw test data, to be comparable with the predicted values #
-  test_data <- test_data[-1, ]
-
-  # model building #
-  model <- build_projection_model2(data = scaled_train)
-
-  ## get out-of-sample prediction ##
-  scaled_pred <- brms::posterior_predict(model, newdata = scaled_test)
-
-  ## recover the prediction to the original scale for comparison ##
-  pred <- inv_z_scale(test_data$daily, scaled_pred)
-
-  # predicted daily vaccine uptake #
-  pred_summary <- data.frame(
-    date = test_data$date,
-    obs = test_data$daily,
-    lower = apply(pred, 2, quantile, 0.025),
-    upper = apply(pred, 2, quantile, 0.975),
-    mean = colMeans(pred)
-  )
-
-  # predicted cumulative vaccine uptake #
-  cumu <- t(apply(pred, 1, cumsum))
-  cumu_summary <- data.frame(
-    mean = colMeans(cumu),
-    obs = test_data$cumulative,
-    lower = apply(cumu, 2, quantile, 0.025),
-    upper = apply(cumu, 2, quantile, 0.975),
-    date = test_data$date
-  )
-
-  return(list(rate = pred_summary, cumulative = cumu_summary))
-}
-
-#' @description This function plots the prediction against the observed data.
-#' This function is embedded in `plot_real_time_projection`
-#' @param option which uptake to plot: "rate" or "cumulative"
-#' @param predicted the out-of-sample prediction from the linear regression
-#' @param test the observed data corresponding to the prediction
-#' @return a ggplot object

-plot_projection <- function(option, predicted, test) {
-  if (option == "rate") {
-    data_y_name <- "daily"
-  } else if (option == "cumulative") {
-    data_y_name <- "cumulative"
-  } else {
-    stop("Response variable in test is not found.")
-  }
-
-  ggplot(predicted) +
-    geom_ribbon(aes(x = date, ymin = lower, ymax = upper),
-      fill = "black", alpha = 0.25
-    ) +
-    geom_line(aes(x = date, y = mean), # Bug warning! colnames may change later #
-      linewidth = 1.5
-    ) +
-    geom_line(aes(x = date, y = .data[[data_y_name]]),
-      data = test,
-      linewidth = 1.5, color = "dodgerblue"
-    ) +
-    xlab("Time") +
-    ylab("% of Population") +
-    ggtitle(paste("Forecast starts at", min(test$date))) +
-    theme_bw() +
-    theme(text = ggplot2::element_text(size = 20)) +
-    annotate("text",
-      x = test$date[round(0.9 * nrow(test))],
-      y = 1.7 * diff(range(predicted$upper)),
-      label = "Input", col = "dodgerblue", size = 5
-    ) +
-    annotate("text",
-      x = test$date[round(0.9 * nrow(test))],
-      y = 1.6 * diff(range(predicted$upper)),
-      label = "Model", col = "black", size = 5
-    )
-}
-
-
-#' @description This function fits the linear regression and plots the prediction
-#' against the observed data in a time-varying manner
-#' @param data_option which uptake to plot: "rate" or "cumulative"
-#' @param all_data data used to iteratively train and test,
-#' with each iteration moving forward one data point
-#' @param min_data_size the minimum data size required for model training;
-#' so far, it is the same for model testing.
-#' @return a grid of ggplots, one panel per forecast date
-
-plot_real_time_projection <- function(
-    data_option = "rate",
-    all_data, min_data_size = 8) {
-  start_date <- sort(all_data$date)[1 + min_data_size - 1]
-  end_date <- sort(all_data$date)[nrow(all_data) - min_data_size + 1]
-
-  date_series <- all_data %>%
-    filter(date >= start_date, date <= end_date) %>%
-    select(date)
-
-  # convert single-column data.frame to vector #
-  date_series <- as.vector(date_series$date) + as.Date("1970-01-01")
-  ps <- list()
-  i <- 1
-
-  for (split_date in date_series) {
-    train_data <- all_data %>%
-      filter(date < split_date)
-
-    test_data <- all_data %>%
-      filter(date >= split_date)
-
-    output <- real_time_projection(
-      train_data = train_data,
-      test_data = test_data
-    )
-
-    if (data_option == "rate") {
-      data_to_plot <- output$rate
-    } else if (data_option == "cumulative") {
-      data_to_plot <- output$cumulative
-    } else {
-      stop("data_option is not valid.")
-    }
-
-    ## plot ##
-    p <- plot_projection(
-      option = data_option,
-      predicted = data_to_plot,
-      test = test_data
-    )
-
-    ps[[i]] <- p
-    i <- i + 1
-  }
-
-  cowplot::plot_grid(plotlist = ps, nrow = ceiling(sqrt(length(date_series))))
-}
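inv_z_scale() is just the inverse of the z-score identity; a short check with invented numbers makes it concrete:

x <- c(2, 4, 6, 8)
z <- as.numeric(scale(x))  # (x - mean(x)) / sd(x)
inv_z_scale(x, z)          # z * sd(x) + mean(x) recovers 2 4 6 8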
diff --git a/R/get_uptake_data.r b/R/get_uptake_data.r
deleted file mode 100644
index 9f01d25..0000000
--- a/R/get_uptake_data.r
+++ /dev/null
@@ -1,153 +0,0 @@
-#' Format uptake data
-#'
-#' Download a local or web .csv file of cumulative uptake percentages.
-#' You must name the columns that contain the state, date, and uptake.
-#'
-#' Provide a .csv with state abbreviations in the first column and full
-#' names in the second. States will be renamed to their abbreviations,
-#' and states missing from the .csv will be discarded.
-#'
-#' Also provide the format of the dates in the date column. Use the default
-#' if there are multiple columns and/or the dates are specified non-numerically.
-#' Provide the first date vaccines were available to calculate elapsed time.
-#'
-#' Optionally, provide filters to remove certain rows from the data.
-#' Filters are a list of vectors. In each vector, the first entry gives the name
-#' of the column to filter on. Subsequent entries give acceptable values in that
-#' column. Rows with unmentioned values will be removed.
-#'
-#' Incident percentage uptake is calculated from the cumulative uptake.
-#' Therefore, if the time intervals are not equivalent between all cumulative
-#' reports, a warning will be generated.
-#'
-#' The uptake data is returned in a data frame with seven columns:
-#'
-#' Column 1 is the state for which percentage cumulative uptake is reported.
-#' Column 2 is the date at which percentage cumulative uptake is reported.
-#' Column 3 is the days elapsed since the vaccine became available.
-#' Column 4 is the year of reporting during the fall portion of the season.
-#' Column 5 is the percentage cumulative uptake.
-#' Column 6 is the incident uptake since the previous report.
-#' Column 7 is the incident uptake averaged daily since the previous report.
-#'
-#' @description Download uptake data and put it into a common format
-#'
-#' @param file_name file name or url address where the data is located
-#' @param state name of column containing the state, or the value to fill in
-#' @param date name of column(s) containing the date, or the value to fill in
-#' @param cumulative name of column containing cumulative uptake
-#' @param state_key file name of .csv with abbr and full name of desired states
-#' @param date_format format of the date in its column
-#' @param start_date date when vaccines were first available
-#' @param filters list of vectors giving column names and values to filter on
-#'
-#' @return formatted uptake data
-#'
-#' @export
-#'
-get_uptake_data <- function(file_name, state, date, cumulative,
-                            state_key = NULL, date_format = "%m/%d/%Y",
-                            start_date = NULL, filters = NULL) {
-  data <- read.csv(file_name)
-
-  if (!is.null(filters)) {
-    for (i in seq_along(filters)) {
-      if (filters[[i]][1] %in% colnames(data)) {
-        data <- data[data[, filters[[i]][1]] %in% filters[[i]][-1], ]
-      } else {
-        stop(paste("There is no column called ", filters[[i]][1], sep = ""))
-      }
-    }
-  }
-
-  if (cumulative %in% colnames(data)) {
-    data <- data[!is.na(data[, cumulative]), ]
-    cuml_col <- data[, cumulative]
-  } else {
-    stop(paste("There is no column called ", cumulative, sep = ""))
-  }
-
-  if (state %in% colnames(data)) {
-    state_col <- data[, state]
-  } else {
-    state_col <- rep_len(state, nrow(data))
-  }
-
-  if (!is.null(state_key)) {
-    state_key <- read.csv(state_key)
-    state_idx <- match(state_col, state_key[, 2], nomatch = 0) +
-      match(state_col, state_key[, 1], nomatch = 0)
-    state_idx[state_idx == 0] <- NA
-    state_col <- state_key[, 1][state_idx]
-    state_col <- state_col[!is.na(state_col)]
-    data <- data[!is.na(state_idx), ]
-    cuml_col <- cuml_col[!is.na(state_idx)]
-  }
-
-  if (all(date %in% colnames(data))) {
-    date_col <- data[, date]
-    if (length(date) > 1) {
-      for (i in seq_along(date)) {
-        if (any(grepl("^[A-Za-z]", date_col[, i]))) {
-          date_col[, i] <- sub(".*-", "", date_col[, i])
-          date_col[, i] <- trimws(date_col[, i], "both")
-          date_col[, i] <- sub(" ", "/", date_col[, i])
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "January", "1")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "February", "2")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "March", "3")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "April", "4")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "May", "5")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "June", "6")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "July", "7")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "August", "8")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "September", "9")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "October", "10")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "November", "11")
-          date_col[, i] <- stringr::str_replace_all(date_col[, i],
-                                                    "December", "12")
-        }
-      }
-      date_col <- do.call(paste, c(date_col, sep = "/"))
-    }
-  } else {
-    date_col <- rep_len(date, nrow(data))
-  }
-  date_col <- as.Date(date_col, format = date_format)
-
-  elapsed_col <- as.numeric(date_col - as.Date(start_date,
-                                               format = date_format))
-
-  year_col <- as.numeric(format(date_col, "%Y")) -
-    1 * (as.numeric(format(date_col, "%m")) < 7)
-
-  data <- data.frame(state = state_col,
-                     date = date_col,
-                     elapsed = elapsed_col,
-                     year = year_col,
-                     cumulative = cuml_col)
-  data <- data[order(data$date), ]
-
-  data$incident <- ave(data$cumulative, data$state,
-                       FUN = function(x) {c(x[1], diff(x))})
-
-  data$daily <- data$incident / ave(data$elapsed, data$state,
-                                    FUN = function(x) {c(x[1], diff(x))})
-
-  check_intervals <- ave(data$elapsed, data$state,
-                         FUN = function(x) {c(x[1], diff(x))})
-  if (length(unique(check_intervals)) > 1) {
-    warning("Not all time intervals are the same for incident uptake!")
-  }
-
-  return(data)
-}
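The `filters` argument format described above — first element names the column, the rest list the values to keep — looks like this in practice (call copied from scripts/evaluation_script.r):

nis_usa_2023 <- get_uptake_data(
  "data/NIS_2023-24.csv",
  state = "geography",
  date = "date",
  cumulative = "estimate",
  state_key = "data/USA_Name_Key.csv",
  date_format = "%m/%d/%Y",
  start_date = "9/12/2023",
  filters = list(c("time_type", "Weekly"), c("group_name", "Overall"))
)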
diff --git a/R/interpolate_dates.r b/R/interpolate_dates.r
deleted file mode 100644
index b5b7e28..0000000
--- a/R/interpolate_dates.r
+++ /dev/null
@@ -1,44 +0,0 @@
-#' Interpolate to match dates across data sets
-#'
-#' BEWARE: THIS DOES NOT YET ACCOUNT FOR MULTI-YEAR DATA SETS
-#'
-#' Dates may be offset between two data sets. To make the dates match,
-#' linear interpolation is performed on the input data set to align its
-#' dates with the target data set. Only target dates occurring between
-#' two input dates are included. Target dates outside the input range are
-#' discarded.
-#'
-#' @description Interpolate one uptake data set to match dates with another
-#'
-#' @param input uptake data on which to perform interpolation
-#' @param target uptake data with the dates to match
-#'
-#' @return interpolated uptake data
-#'
-#' @export
-#'
-interpolate_dates <- function(input, target) {
-  output <- target
-  output$cumulative <- rep(NA, nrow(output))
-
-  for (i in seq_len(nrow(target))) {
-    input_sub <- input[input$state == target$state[i], ]
-    time_diff <- target$date[i] - input_sub$date
-    time_idx <- order(abs(time_diff))[1:2]
-    time_diff <- time_diff[time_idx]
-    if (sum(time_diff < 0) == 1) {
-      input_sub <- input_sub[time_idx, ]
-      weights <- as.numeric(abs(time_diff)) / as.numeric(sum(abs(time_diff)))
-      output$cumulative[i] <- sum(input_sub$cumulative * rev(weights))
-    }
-  }
-
-  output <- output[!is.na(output$cumulative), ]
-
-  output$incident <- ave(output$cumulative, output$state,
-                         FUN = function(x) {c(x[1], diff(x))})
-
-  output$daily <- output$incident / ave(output$elapsed, output$state,
-                                        FUN = function(x) {c(x[1], diff(x))})
-
-  return(output)
-}
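The weighting logic in miniature, with invented numbers: a target date 3 days after one input report (cumulative 10) and 4 days before the next (cumulative 17) takes the closer report's value with the larger weight.

time_diff <- c(3, -4)                            # target date - input dates
weights <- abs(time_diff) / sum(abs(time_diff))  # c(3/7, 4/7)
sum(c(10, 17) * rev(weights))                    # 10 * 4/7 + 17 * 3/7 = 13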
diff --git a/R/retrospective_evaluate.r b/R/retrospective_evaluate.r
deleted file mode 100644
index 679503d..0000000
--- a/R/retrospective_evaluate.r
+++ /dev/null
@@ -1,82 +0,0 @@
-library(dplyr)
-
-# Retrospective evaluation of forecasts
-# MSPE only in this version.
-# Updates for the future:
-#   1. Public-health-oriented metrics: end-of-season totals, high-demand periods, etc.
-#   2. Probabilistic metrics: QS, WIS, etc.
-
-
-#' @description Get the mean squared prediction error
-#' @param data observed data; can be a constant or a vector
-#' @param pred the prediction output from models; can be a constant or a vector
-#' @return MSPE
-get_mspe <- function(data, pred) {
-  mean((data - pred)^2)
-}
-
-#' @description Retrospectively evaluate the forecast given the test data
-#' @param data_option which uptake data type to evaluate: "rate" or "cumulative"
-#' @param evaluate_option which metric to use; "mspe" only so far.
-#' Adding metrics means changing the output from the embedded functions.
-#' @param all_data data used to iteratively train and test,
-#' with each iteration moving forward one data point
-#' @param min_data_size the minimum data size required for model training;
-#' so far, this number is the same for model testing.
-#' @return a data frame with the MSPE for each initialization date
-
-evaluate_real_time_projection <- function(
-    data_option = "rate",
-    evaluate_option = "mspe",
-    all_data, min_data_size = 8) {
-  start_date <- sort(all_data$date)[1 + min_data_size - 1]
-  end_date <- sort(all_data$date)[nrow(all_data) - min_data_size + 1]
-
-  date_series <- all_data %>%
-    filter(date >= start_date, date <= end_date) %>%
-    select(date)
-
-  # convert single-column data.frame to vector #
-  date_series <- as.vector(date_series$date) + as.Date("1970-01-01")
-  metrics <- data.frame()
-
-  for (split_date in date_series) {
-    train_data <- all_data %>%
-      filter(date < split_date)
-
-    test_data <- all_data %>%
-      filter(date >= split_date)
-
-    output <- real_time_projection(
-      train_data = train_data,
-      test_data = test_data
-    )
-
-    if (data_option == "rate") {
-      data_to_eval <- output$rate
-    } else if (data_option == "cumulative") {
-      data_to_eval <- output$cumulative
-    } else {
-      stop("data_option is not valid.")
-    }
-
-    ## evaluation ##
-
-    # MSPE #
-    # So far, only MSPE is used.
-    # Note: what should the code structure be if multiple metrics are evaluated simultaneously?
-    if (evaluate_option == "mspe") {
-      metric <- get_mspe(data_to_eval$obs, data_to_eval$mean)
-    }
-
-    metrics <- rbind(
-      metrics,
-      data.frame(
-        forecast_date = split_date,
-        mspe = metric
-      )
-    )
-  }
-
-  return(metrics)
-}
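A worked check of the metric, with invented numbers:

obs <- c(1.0, 2.0, 3.0)
pred <- c(1.1, 1.8, 3.3)
get_mspe(obs, pred)  # mean(c(0.01, 0.04, 0.09)) = 0.0467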
diff --git a/scripts/evaluation_script.r b/scripts/evaluation_script.r
deleted file mode 100644
index 3242498..0000000
--- a/scripts/evaluation_script.r
+++ /dev/null
@@ -1,58 +0,0 @@
-rm(list = ls())
-# read all the R files under the R/ folder #
-R_path <- "R/"
-purrr::map(paste0(R_path, list.files("R/")), source)
-
-# load observed data #
-
-# Load 2022 NIS data for USA
-nis_usa_2022 <- get_uptake_data(
-  "https://data.cdc.gov/api/views/akkj-j5ru/rows.csv?accessType=DOWNLOAD",
-  state = "Geography",
-  date = c("Time.Period", "Year"),
-  cumulative = "Estimate....",
-  state_key = "data/USA_Name_Key.csv",
-  date_format = "%m/%d/%Y",
-  start_date = "9/2/2022",
-  filters = list(c(
-    "Indicator.Category",
-    "Received updated bivalent booster dose (among adults who completed primary series)"
-  ))
-)
-
-# Load 2023 NIS data for USA
-nis_usa_2023 <- get_uptake_data(
-  "data/NIS_2023-24.csv",
-  state = "geography",
-  date = "date",
-  cumulative = "estimate",
-  state_key = "data/USA_Name_Key.csv",
-  date_format = "%m/%d/%Y",
-  start_date = "9/12/2023",
-  filters = list(c("time_type", "Weekly"), c("group_name", "Overall"))
-)
-
-
-# generate projections initiated at different times #
-all_data <- rbind(nis_usa_2022, nis_usa_2023)
-
-# make sure there are enough data points for model training (min_data_size, default 8); 30 min to run #
-plot_real_time_projection(
-  data_option = "rate",
-  all_data = all_data
-)
-
-# evaluate MSPE between projections and data initiated at different times; 30 min to run #
-mspe_df <- evaluate_real_time_projection(evaluate_option = "mspe", data_option = "rate", all_data = all_data)
-
-mspe_df %>%
-  mutate(forecast_date = forecast_date + as.Date("1970-01-01")) %>%
-  filter(mspe < 1) %>%
-  ggplot() +
-  geom_point(aes(x = forecast_date, y = mspe)) +
-  xlab("Forecast date") +
-  ylab("MSPE") +
-  theme_bw()
-ggsave("mspe.jpeg", width = 4, height = 3, units = "in")
-# Note for future upgrade: separate model fitting from plotting and evaluation.
-# Repeatedly fitting the model to plot and evaluate is not time-efficient.
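The `+ as.Date("1970-01-01")` idiom above compensates for R's `for` loop stripping the Date class from `split_date`; a quick illustration:

d <- as.Date("2022-10-01")
for (x in d) print(x)          # prints 19266: days since the epoch, class lost
as.Date("1970-01-01") + 19266  # "2022-10-01" again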
diff --git a/scripts/projection_script.r b/scripts/projection_script.r
deleted file mode 100644
index 1201d02..0000000
--- a/scripts/projection_script.r
+++ /dev/null
@@ -1,65 +0,0 @@
-setwd("/home/tec0/covid_booster_uptake")
-source("get_uptake_data.r")
-source("interpolate_dates.r")
-source("build_scaling_model.r")
-source("evaluate_scaling.r")
-source("build_projection_model.r")
-source("evaluate_projection.r")
-source("projection_functions.r")
-
-# Load 2022 IIS data for USA
-iis_usa_2022 <- get_uptake_data(
-  "https://data.cdc.gov/api/views/unsk-b7fc/rows.csv?accessType=DOWNLOAD",
-  state = "Location",
-  date = "Date",
-  cumulative = "Bivalent_Booster_18Plus_Pop_Pct",
-  state_key = "USA_Name_Key.csv",
-  date_format = "%m/%d/%Y",
-  start_date = "9/2/2022"
-)
-
-# Load 2022 NIS data for USA
-nis_usa_2022 <- get_uptake_data(
-  "https://data.cdc.gov/api/views/akkj-j5ru/rows.csv?accessType=DOWNLOAD",
-  state = "Geography",
-  date = c("Time.Period", "Year"),
-  cumulative = "Estimate....",
-  state_key = "USA_Name_Key.csv",
-  date_format = "%m/%d/%Y",
-  start_date = "9/2/2022",
-  filters = list(c("Indicator.Category",
-                   "Received updated bivalent booster dose (among adults who completed primary series)"))
-)
-
-# Load 2023 NIS data for USA
-nis_usa_2023 <- get_uptake_data(
-  "NIS_2023-24.csv",
-  state = "geography",
-  date = "date",
-  cumulative = "estimate",
-  state_key = "USA_Name_Key.csv",
-  date_format = "%m/%d/%Y",
-  start_date = "9/12/2023",
-  filters = list(c("time_type", "Weekly"), c("group_name", "Overall"))
-)
-
-# Interpolate 2022 IIS dates to match 2022 NIS
-iis_usa_2022_interp <- interpolate_dates(iis_usa_2022, nis_usa_2022)
-
-# Build and evaluate a model to scale 2022 IIS to 2022 NIS
-iis_to_nis_model <- build_scaling_model(iis_usa_2022_interp, nis_usa_2022)
-evaluate_scaling(iis_usa_2022_interp, nis_usa_2022, iis_to_nis_model)
-
-# Build and evaluate a model to project 2022 IIS data forward
-iis_forward_model <- build_projection_model(iis_usa_2022)
-evaluate_projection(iis_usa_2022, iis_forward_model)
-evaluate_projection(iis_usa_2022, iis_forward_model, start_date = "2022-10-19")
-
-# Build and evaluate a model to project 2022 NIS data forward
-nis_forward_model <- build_projection_model(nis_usa_2022)
-evaluate_projection(nis_usa_2022, nis_forward_model)
-evaluate_projection(nis_usa_2022, nis_forward_model, start_date = "2022-09-10")
-
-# Does the 2022 NIS projection model work on the 2023 NIS data?
-evaluate_projection(nis_usa_2022, nis_forward_model, nis_usa_2023)
-evaluate_projection(nis_usa_2022, nis_forward_model, nis_usa_2023, start_date = "2022-09-30")