Evaluation code (R) (#15)

Adapts #5 to include only R code (or #9 maybe) --------- Co-authored-by: Fuhan Yang <[email protected]> Co-authored-by: Fuhan-Yang <[email protected]>
CDCgov · Oct 23, 2024 · 0580abd · 0580abd
1 parent 4bc7638
commit 0580abd
Show file tree

Hide file tree

Showing 4 changed files with 356 additions and 13 deletions.
diff --git a/R/build_projection_model.r → R/build_projection_model_opt_plot.r b/R/build_projection_model.r → R/build_projection_model_opt_plot.r
@@ -31,8 +31,7 @@
 #'
 #' @export
 #'
-build_projection_model <- function(data, drop_first = NULL) {
-
+build_projection_model2 <- function(data, drop_first = NULL, output = F) {
   if (is.null(drop_first)) {
     intervals <- diff(data$elapsed)
     if (abs(data$elapsed[1] - mean(intervals)) > (sd(intervals) + 1)) {
@@ -47,18 +46,15 @@ build_projection_model <- function(data, drop_first = NULL) {
     data <- data[-1, ]
   }
 
-  data$previous <- c(NA, data$daily[-nrow(data)])
-  data <- data[-1, ]
-  data$previous_std <- (data$previous - mean(data$previous)) / sd(data$previous)
-  data$elapsed_std <- (data$elapsed - mean(data$elapsed)) / sd(data$elapsed)
-  data$daily_std <- (data$daily - mean(data$daily)) / sd(data$daily)
-
   model <- brms::brm(daily_std ~ previous_std * elapsed_std,
-                     data = data,
-                     family = gaussian())
+    data = data,
+    family = gaussian()
+  )
 
-  print(summary(model))
-  plot(model)
+  if (output) {
+    print(summary(model))
+    plot(model)
+  }
 
   return(model)
-}
+}
diff --git a/R/generate_projections.R b/R/generate_projections.R
@@ -0,0 +1,207 @@
+library(dplyr)
+library(ggplot2)
+
+### functions from bottom to top:
+# inv_z_scale()
+# data_prepare()
+# build_projection_model2()
+#
+#           IS IN
+#
+# real_time_projection() -> plot_projection()
+#
+#                       IS IN
+#
+# plot_real_time_projection()
+
+# real_time_projection() -> get_mspe()
+#
+#                    IS IN
+#
+# evaluate_real_time_projection()
+
+
+#### inverse Z score #####
+#' @description Inversely transform z-score standardized data to raw data
+#'              using: raw = z*sd + mean
+#'
+#' @param old_data: the previous data that is used for z-score standardization
+#' @param scaled_data: current scaled data
+#' @return inversely z-score transformed data
+inv_z_scale <- function(old_data, scaled_data) {
+    sd <- sd(old_data)
+    m <- mean(old_data)
+
+    scaled_data * sd + m
+}
+
+
+#' @description Data format preparation for model building.
+#' It includes:
+#' 1. Add `previous` predictor.
+#' 2. Z-score standardization of `previous`,`elapsed`, and `daily`
+#' @param data: the data frame need to be prepared.
+#' Must include columns:`previous`,`elapsed`, and `daily`
+#' @return Data frame with scaled `previous`,`elapsed`, and `daily` and ready to build model
+data_prepare <- function(data) {
+    data$previous <- c(NA, data$daily[-nrow(data)]) # nolint
+    data <- data[-1, ]
+    data$previous_std <- scale(data$previous, center = T, scale = T)
+    data$elapsed_std <- scale(data$elapsed, center = T, scale = T)
+    data$daily_std <- scale(data$daily, center = T, scale = T)
+
+    return(data)
+}
+
+#' @description This function uses `train_data` to train the linear regression
+#' and project to the last date in the `test_data`.
+#' It is embedded in `plot_real_time_projection` and 'evaluate_real_time_projection'
+#' @param train_data: data frame used to train linear regression
+#' @param test_data: data frame used to compare with the prediction generated by linear regression
+#' @return a list containing the predicted uptake and the corresponding
+#' observed rate under the `test_data` time frame,
+#' in both uptake rate `rate` and cumulative uptake `cumulative`
+
+real_time_projection <- function(
+    train_data = nis_usa_2022,
+    test_data = nis_usa_2023) {
+    # transform the train data and test data to the format of model building and model prediction #
+    scaled_train <- data_prepare(train_data)
+    scaled_test <- data_prepare(test_data)
+
+    # The first row is also removed for raw test data, to be comparable with the predicted values #
+    test_data <- test_data[-1, ]
+
+    # model building #
+    model <- build_projection_model2(data = scaled_train)
+
+    ## get out-of-sample prediction ##
+    scaled_pred <- brms::posterior_predict(model, newdata = scaled_test)
+
+    ## recover the prediction to the original format for comparision ##
+    pred <- inv_z_scale(test_data$daily, scaled_pred)
+
+    # predicted daily vaccine uptake #
+    pred_summary <- data.frame(
+        date = test_data$date,
+        obs = test_data$daily,
+        lower = apply(pred, 2, quantile, 0.025),
+        upper = apply(pred, 2, quantile, 0.975),
+        mean = colMeans(pred)
+    )
+
+    # predicted cumulative vaccine uptake #
+    cumu <- t(apply(pred, 1, cumsum))
+    cumu_summary <- data.frame(
+        mean = colMeans(cumu),
+        obs = test_data$cumulative,
+        lower = apply(cumu, 2, quantile, 0.025),
+        upper = apply(cumu, 2, quantile, 0.975),
+        date = test_data$date
+    )
+
+    return(list(rate = pred_summary, cumulative = cumu_summary))
+}
+
+#' @description This function plots prediction with observed data
+#' This function is embeded in `plot_real_time_projection`
+#' @param option: which uptake to plot: 'rate' or 'cumulative'
+#' @param predicted: out-of-sample prediction from linear regression
+#' @param test: the observed data corresponding to the prediction
+#' @return a ggplot object
+
+plot_projection <- function(option, predicted, test) {
+    if (option == "rate") {
+        data_y_name <- "daily"
+    } else if (option == "cumulative") {
+        data_y_name <- "cumulative"
+    } else {
+        stop("Response variable in test is not found.")
+    }
+
+    ggplot(predicted) +
+        geom_ribbon(aes(x = date, ymin = lower, ymax = upper),
+            fill = "black", alpha = 0.25
+        ) +
+        geom_line(aes(x = date, y = mean), # Bug warning! colnames may change later #
+            linewidth = 1.5
+        ) +
+        geom_line(aes(x = date, y = daily),
+            data = test,
+            linewidth = 1.5, color = "dodgerblue"
+        ) +
+        xlab("Time") +
+        ylab("% of Population") +
+        ggtitle(paste("Forecast starts at", min(test$date))) +
+        theme_bw() +
+        theme(text = ggplot2::element_text(size = 20)) +
+        annotate("text",
+            x = test$date[round(0.9 * nrow(test))],
+            y = 1.7 * diff(range(predicted$upper)),
+            label = "Input", col = "dodgerblue", size = 5
+        ) +
+        annotate("text",
+            x = test$date[round(0.9 * nrow(test))],
+            y = 1.6 * diff(range(predicted$upper)),
+            label = "Model", col = "black", size = 5
+        )
+}
+
+
+#' @description This function fits the linear regression and plots the prediction
+#' with the observed data in a time-varying manner
+#' @param data_option: which uptake to plot: 'rate' or 'cumulative'
+#' @param all_data: Data used to iteratively train and test,
+#' with each iteration moving forward one data point
+#' @param min_data_size: the minimum data size required for model training,
+#' So far, it is the same for model testing.
+#' @return a X*Y grid ggplot
+
+plot_real_time_projection <- function(
+    data_option = "rate",
+    all_data, min_data_size = 8) {
+    start_date <- sort(all_data$date)[1 + min_data_size - 1]
+    end_date <- sort(all_data$date)[nrow(all_data) - min_data_size + 1]
+
+    date_series <- all_data %>%
+        filter(date >= start_date, date <= end_date) %>%
+        select(date)
+
+    # convert single-column data.frame to vector #
+    date_series <- as.vector(date_series$date) + as.Date("1970-01-01")
+    ps <- list()
+    i <- 1
+
+    for (split_date in date_series) {
+        train_data <- all_data %>%
+            filter(date < split_date)
+
+        test_data <- all_data %>%
+            filter(date >= split_date)
+
+        output <- real_time_projection(
+            train_data = train_data,
+            test_data = test_data
+        )
+
+        if (data_option == "rate") {
+            data_to_plot <- output$rate
+        } else if (data_option == "cumulative") {
+            data_to_plot <- output$cumulative
+        } else {
+            stop("Data_option is not valid.")
+        }
+
+        ## plot ##
+        plot_projection(
+            option = data_option,
+            predicted = data_to_plot,
+            test = test_data
+        ) -> p
+
+        ps[[i]] <- p
+        i <- i + 1
+    }
+
+    cowplot::plot_grid(plotlist = ps, nrow = ceiling(sqrt(length(date_series))))
+}
diff --git a/R/retrospective_evaluate.r b/R/retrospective_evaluate.r
@@ -0,0 +1,82 @@
+library(dplyr)
+
+# Retrospective evaluation of forecast
+# MSPE only in this version.
+# Update in the future:
+# 1. Public health oriented metrics: end-of-season totals, high demand period, etc.
+# 2. Probabilistic metrics: QS, WIS, etc.
+
+
+#' @description Get mean-squared prediction error
+#' @param data: observed data, can be a constant or a vector
+#' @param pred: the prediction output from models, can be a constant or a vector
+#' @return MSPE
+get_mspe <- function(data, pred) {
+    mean((data - pred)^2)
+}
+
+#' @description Retrospectively evaluate the forecast given the test data
+#' @param data_option: which uptake data type to evaluate? 'rate' or 'cumulative'
+#' @param evaluate_option: which metrics to use? 'mspe' only so far.
+#' Adding this indicates changing the output from the embedded functions.
+#' @param all_data: Data used to iteratively train and test,
+#' with each iteration moving forward one data point
+#' @param min_data_size: the minimum data size required for model training,
+#' So far, this number is the same for model testing.
+#' @return a data frame with MSPE given different initializing date
+
+evaluate_real_time_projection <- function(
+    data_option = "rate",
+    evaluate_option = "mspe",
+    all_data, min_data_size = 8) {
+    start_date <- sort(all_data$date)[1 + min_data_size - 1]
+    end_date <- sort(all_data$date)[nrow(all_data) - min_data_size + 1]
+
+    date_series <- all_data %>%
+        filter(date >= start_date, date <= end_date) %>%
+        select(date)
+
+    # convert single-column data.frame to vector #
+    date_series <- as.vector(date_series$date) + as.Date("1970-01-01")
+    metrics <- data.frame()
+
+    for (split_date in date_series) {
+        train_data <- all_data %>%
+            filter(date < split_date)
+
+        test_data <- all_data %>%
+            filter(date >= split_date)
+
+        output <- real_time_projection(
+            train_data = train_data,
+            test_data = test_data
+        )
+
+        if (data_option == "rate") {
+            data_to_eval <- output$rate
+        } else if (data_option == "cumulative") {
+            data_to_eval <- output$cumulative
+        } else {
+            stop("Data_option is not valid.")
+        }
+
+        ## evaluation ##
+
+        # MSPE #
+        # So far, only use MSPE.
+        # Note: What can be the code structure if multiple metrics are evaluated simultaneously
+        if (evaluate_option == "mspe") {
+            metric <- get_mspe(data_to_eval$obs, data_to_eval$mean)
+        }
+
+        metrics <- rbind(
+            metrics,
+            data.frame(
+                forecast_date = split_date,
+                mspe = metric
+            )
+        )
+    }
+
+    return(metrics)
+}
diff --git a/scripts/evaluation_script.r b/scripts/evaluation_script.r
@@ -0,0 +1,58 @@
+rm(list = ls())
+# read all the R files under R folder #
+R_path <- "R/"
+purrr::map(paste0(R_path, list.files("R/")), source)
+
+# load observed data #
+
+# Load 2022 NIS data for USA
+nis_usa_2022 <- get_uptake_data(
+    "https://data.cdc.gov/api/views/akkj-j5ru/rows.csv?accessType=DOWNLOAD",
+    state = "Geography",
+    date = c("Time.Period", "Year"),
+    cumulative = "Estimate....",
+    state_key = "data/USA_Name_Key.csv",
+    date_format = "%m/%d/%Y",
+    start_date = "9/2/2022",
+    filters = list(c(
+        "Indicator.Category",
+        "Received updated bivalent booster dose (among adults who completed primary series)"
+    ))
+)
+
+# Load 2023 NIS data for USA
+nis_usa_2023 <- get_uptake_data(
+    "data/NIS_2023-24.csv",
+    state = "geography",
+    date = "date",
+    cumulative = "estimate",
+    state_key = "data/USA_Name_Key.csv",
+    date_format = "%m/%d/%Y",
+    start_date = "9/12/2023",
+    filters = list(c("time_type", "Weekly"), c("group_name", "Overall"))
+)
+
+
+# generate projections initiated at different time #
+all_data <- rbind(nis_usa_2022, nis_usa_2023)
+
+# make sure there at least 4 data points for model training: 30 min to run #
+plot_real_time_projection(
+    data_option = "rate",
+    all_data = all_data
+)
+
+# evaluate MSPE between projections and data initiated at different time: 30 min to run #
+evaluate_real_time_projection(evaluate_option = "mspe", data_option = "rate", all_data = all_data) -> mspe_df
+
+mspe_df %>%
+    mutate(forecast_date = forecast_date + as.Date("1970-01-01")) %>%
+    filter(mspe < 1) %>%
+    ggplot() +
+    geom_point(aes(x = forecast_date, y = mspe)) +
+    xlab("Forecast date") +
+    ylab("MSPE") +
+    theme_bw()
+ggsave("mspe.jpeg", width = 4, height = 3, units = "in")
+# Note for future upgrade: seperate model fitting from plotting and evaluation.
+# Repeatedly fitting the model to plot and evaluate is not time-efficient.