---
title: "Study of delay prediction in the US airport network"
author: "Kerim Kiliç"
subtitle: "Supervised Machine Learning using flight data"
output:
  html_document:
    df_print: paged
    toc: true
    toc_depth: 2
    number_sections: true
    toc_float: true
---
# Libraries
The following libraries are used in this R Markdown file.
```{r setup, message=FALSE}
markdown_start_time <- Sys.time()
knitr::opts_chunk$set(echo = TRUE)
library(data.table)
library(tidymodels)
library(sparklyr)
library(kableExtra)
source("src/functions.R")
library(h2o)
library(rsparkling)
library(worldmet)
library(janitor)
```
# Initialize spark, h2o and read in the raw data
## Initialize spark and h2o
```{r, spark_h2o_setup}
### Check if spark installation exists, if not install correct version
if(!spark_install_find("3.3.0")$installed)
{
spark_install(version = "3.3.0")
}
### Initialize spark
spark_config <- spark_config()
# Change memory based on your machine: i.e. 12 GB available RAM -> "12G" etc.
spark_config$'sparklyr.shell.driver-memory' <- "12G"
sc <- spark_connect(master = "local",
config = spark_config,
version = "3.3.0")
### Initialize h2o
h2oConf <- rsparkling::H2OConf()
hc <- H2OContext.getOrCreate(h2oConf)
```
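A quick sanity check that both engines came up (standard sparklyr and h2o status accessors; shown as a sketch and not evaluated here):
```{r, connection_check, eval=FALSE}
# Confirm the Spark connection and the h2o cluster are alive.
spark_version(sc)   # should report 3.3.0
h2o.clusterInfo()   # prints the h2o cluster status
```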
## Read in raw data
Let's read in the raw flight data of 2017 and glimpse into the different variables.
```{r, read_raw_data}
### Check if csv of the data file exists, if not create one.
if(!file.exists("data/flights_2017.csv"))
{
my_data <- readRDS("data/flights_2017.RDS")
fwrite(my_data,file = "data/flights_2017.csv")
}
raw_data <- spark_read_csv(sc,"flights_data","data/flights_2017.csv",memory=FALSE)
raw_data %>% glimpse()
```
Let's check the total number of rows in the data set.
```{r, raw_data_rows}
sdf_nrow(raw_data)
```
# Create data-cleaning pipeline
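The pipelines below are magrittr functional sequences: a pipe chain that starts with `.` is stored as a reusable function and applied to Spark tables later. A flight is labeled delayed (`delay = "1"`) when it arrives 15 minutes or more after its planned arrival time, the standard DOT reporting threshold. A minimal sketch of the idiom on toy data (illustrative only, not evaluated):
```{r, functional_sequence_example, eval=FALSE}
# `.` at the head of the pipe turns this into a function instead of an
# immediate computation; it can then be applied like any other function.
count_delayed <- . %>%
  filter(delay_time >= 15) %>%
  summarise(n_delayed = n())
# count_delayed(some_flights_table)
```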
```{r, pipeline}
main_pipeline <- .%>%
mutate(delay_time = actual_arrival_time - planned_arrival_time,
delay_time = minute(delay_time) + (hour(delay_time)*60) + (second(delay_time)/60),
delay = case_when(delay_time >= 15 ~ "1",
delay_time < 15 ~ "0"),
date = paste0(year,"-",month,"-",day_of_month),
date = as.Date(date),
flight_time = planned_arrival_time - planned_departure_time,
flight_time = minute(flight_time) + (hour(flight_time)*60) + (second(flight_time)/60),
speed = flight_distance / flight_time) %>%
filter(origin %in% airports , destination %in% airports) %>%
select(date,
quarter,
month,
day_of_month,
day_of_week,
flight_distance,
seating_capacity,
origin,
destination,
carrier,
delay,
delay_time,
flight_time,
speed,
planned_arrival_local_hour,
planned_departure_local_hour)
# Create the numerical delay sub-pipeline
numer_delay_pipeline <- .%>% main_pipeline %>%
select(-delay)
# Create the classification sub-pipeline
class_delay_pipeline <- .%>% main_pipeline %>%
select(-delay_time) %>%
group_by(delay)
# Pipeline to add one-hot encoding:
one_hot_features <- c("origin",
"destination",
"carrier")
one_hot_features_ind <- paste0(one_hot_features,"_ind")
one_hot_features_out <- paste0(one_hot_features,"_out")
one_hot_encoding_pipeline <- . %>%
ft_string_indexer(input_col = one_hot_features[1], output_col = one_hot_features_ind[1]) %>%
ft_string_indexer(input_col = one_hot_features[2], output_col = one_hot_features_ind[2]) %>%
ft_string_indexer(input_col = one_hot_features[3], output_col = one_hot_features_ind[3]) %>%
ft_one_hot_encoder(input_cols = one_hot_features_ind, output_cols = one_hot_features_out)%>%
select(-origin,-destination, -carrier) %>%
ft_vector_assembler(input_cols = one_hot_features_out,
output_col = "one_hot_output")
adding_extra_features <- . %>%
### Join the weather data of the origin airport
left_join(weather_data,by=c("origin"="origin","date"="date")) %>%
left_join(., a_b_joined1,
by = c("destination"="destination", "date"="date", "planned_arrival_local_hour"="planned_arrival_local_hour")) %>%
left_join(., a_b_joined2,
by = c("origin"="origin", "date"="date","planned_departure_local_hour"="planned_departure_local_hour")) %>%
mutate(total_flights_destination = case_when(is.na(total_flights_destination) ~ 0,
!is.na(total_flights_destination) ~ total_flights_destination),
total_flights_origin = case_when(is.na(total_flights_origin) ~ 0,
!is.na(total_flights_origin) ~ total_flights_origin)) %>%
left_join(., class_delay_pipeline(raw_data) %>% group_by(origin,carrier,date) %>% summarise(departing_carrier_flights = n()),
by = c("origin"="origin", "carrier"="carrier", "date"="date")) %>%
left_join(., weather_data2,by=c("destination"="destination","date"="date")) %>%
select(-date) %>%
na.omit()
# Final classification pipeline for Spark and h2o models
spark_class_pipeline <- . %>% class_delay_pipeline %>% adding_extra_features %>% one_hot_encoding_pipeline
h2o_class_pipeline <- . %>% class_delay_pipeline %>% adding_extra_features
```
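The one-hot step above chains Spark feature transformers: each string column is mapped to a numeric index (`ft_string_indexer`), the indices are expanded into indicator vectors (`ft_one_hot_encoder`), and the vectors are assembled into a single feature column (`ft_vector_assembler`). A self-contained sketch of the same chain on a toy table (hypothetical `carrier` column, not evaluated):
```{r, one_hot_sketch, eval=FALSE}
# Toy illustration of the string-index -> one-hot -> assemble chain.
toy <- sdf_copy_to(sc, data.frame(carrier = c("AA", "DL", "UA", "AA")),
                   overwrite = TRUE)
toy %>%
  ft_string_indexer(input_col = "carrier", output_col = "carrier_ind") %>%
  ft_one_hot_encoder(input_cols = "carrier_ind", output_cols = "carrier_out") %>%
  ft_vector_assembler(input_cols = "carrier_out", output_col = "one_hot_output")
```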
# Top 10 airports
Check the top 10 airports with the most departing and arriving flights, in order to narrow the data set down to flights between these airports.
```{r, top_10_airports}
### Check the top 10 origin and destination with the most flight traffic to narrow down the data
raw_data_tmp <- raw_data %>% select(origin,destination) %>% collect()
tmp1 <- data.frame(table(raw_data_tmp$origin))
tmp2 <- data.frame(table(raw_data_tmp$destination))
tmp1 <- tmp1[order(-tmp1$Freq),] %>% head(10)
tmp2 <- tmp2[order(-tmp2$Freq),] %>% head(10)
airports <- tmp1 %>%
mutate(Var1 = as.character(Var1)) %>%
pull(Var1)
rm(raw_data_tmp)
rm(tmp1,tmp2)
```
# Prepare weather data to use as features
Get the weather data that corresponds to the top 10 airports.
```{r, weather_data, message=FALSE, fig.show='hide', results='hide'}
if(!file.exists("data/origin_weather_data.csv") | !file.exists("data/destination_weather_data.csv"))
{
get_weather_data(sc)
}
weather_data <- spark_read_csv(sc,"origin_weather_data","data/origin_weather_data.csv",memory=FALSE)
weather_data2 <- spark_read_csv(sc,"destination_weather_data","data/destination_weather_data.csv",memory=FALSE)
```
Glimpse into the weather data of the origin airports.
```{r, origin_weather_data_glimpse}
weather_data %>% glimpse()
```
Glimpse into the weather data of the destination airports.
```{r, destination_weather_data_glimpse}
weather_data2 %>% glimpse()
```
```{r}
### Adding the airport congestion
a1 <- main_pipeline(raw_data) %>% group_by(destination,date,planned_arrival_local_hour) %>% summarise(total_arrival_flights1=n())
a2 <- main_pipeline(raw_data) %>% group_by(origin,date,planned_departure_local_hour) %>% summarise(total_departing_flights1=n())
a_b_joined1 <- left_join(a1,a2,by = c("destination"="origin","date"="date","planned_arrival_local_hour"="planned_departure_local_hour")) %>%
mutate(total_flights_destination = total_arrival_flights1 + total_departing_flights1) %>%
select(destination,date,total_flights_destination,planned_arrival_local_hour)
a_b_joined2 <- left_join(a2,a1,by = c("origin"="destination","date"="date","planned_departure_local_hour"="planned_arrival_local_hour")) %>%
mutate(total_flights_origin = total_arrival_flights1 + total_departing_flights1) %>%
select(origin,date,total_flights_origin,planned_departure_local_hour)
###
```
# Splitting data into train and test sets
## Creating the train and test sets for spark
Perform the train/test split for Spark.
```{r, spark_data_split, message=FALSE}
classification_split <- create_train_test_split_(data = spark_class_pipeline(raw_data),
ratio = 0.9,
type = "spark",
hc = hc)
train_data <- classification_split$train_data
test_data <- classification_split$test_data
```
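`create_train_test_split_()` is a helper defined in `src/functions.R`. A minimal sketch of what such a split can look like with sparklyr's built-in splitter (an assumed implementation, not the actual helper):
```{r, split_sketch, eval=FALSE}
# Hypothetical 90/10 split; the real helper in src/functions.R may differ.
splits <- sdf_random_split(spark_class_pipeline(raw_data),
                           training = 0.9, test = 0.1, seed = 2018)
train_data <- splits$training
test_data  <- splits$test
```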
Let's glimpse into the Spark training data for classification.
```{r, train_glimpse}
train_data %>% glimpse()
```
## Creating the train, validation, and test sets for h2o
Create the train, validation, and test split to use with the h2o framework.
```{r, h2o_data_split, message=FALSE}
h2o_classification_split <- create_train_test_split(data = h2o_class_pipeline(raw_data),
ratio = 0.8,
type = "h2o",
hc = hc)
train_data_h2o <- hc$asH2OFrame(h2o_classification_split$train_data)
valid_data_h2o <- hc$asH2OFrame(h2o_classification_split$valid_data)
test_data_h2o <- hc$asH2OFrame(h2o_classification_split$test_data)
```
```{r, clear_data, message=FALSE}
rm(raw_data)
rm(classification_split)
rm(h2o_classification_split)
rm(weather_data)
rm(weather_data2)
rm(a_b_joined1,a_b_joined2)
rm(a1,a2)
gc()
gc()
h2o:::.h2o.garbageCollect()
h2o:::.h2o.garbageCollect()
h2o:::.h2o.garbageCollect()
sc %>% spark_session %>% invoke("catalog") %>%
invoke("dropTempView","flights_data")
sc %>% spark_session %>% invoke("catalog") %>%
invoke("dropTempView","origin_weather_data")
sc %>% spark_session %>% invoke("catalog") %>%
invoke("dropTempView","destination_weather_data")
```
# Classification of flight delays
In this section we will build and evaluate different machine learning models to predict whether a given inbound flight in the United States will be delayed, based on the data prepared in the previous sections.
## Logistic regression model
In this section we will build a logistic regression pipeline, then cross-validate and hyper-parameter tune the model.
### Building a logistic regression pipeline
Below we build an ML pipeline for a logistic regression model so that cross-validation can be used while we perform hyper-parameter tuning.
```{r, glm_model_cv}
# Pipeline
glm_pipeline <- ml_pipeline(sc) %>%
ft_r_formula(delay ~ .) %>%
ml_logistic_regression()
# Grid
grid <- list(logistic_regression = list(elastic_net_param = c(0,0.25,0.5,0.75,1), reg_param = c(0,0.25,0.5,0.75,1)))
# Cross validate model
glm_cv <- cross_validator(sc = sc,
data = train_data,
pipeline = glm_pipeline,
grid = grid,
type = "classification",
folds = 4,
seed = 2018)
# Get model results
a <- glm_cv$all_results
glm_cv_best_result <- glm_cv$best_result
glm_cv_result <- glm_cv_best_result[which.max(glm_cv_best_result$accuracy),"accuracy"]
glm_train_time <- glm_cv$train_time
a[order(-a$accuracy),] %>%
head(5) %>%
kbl() %>%
kable_minimal()
```
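`cross_validator()` is another helper from `src/functions.R`. For reference, sparklyr ships a native cross-validation API; a sketch of a roughly equivalent call (assumed to mirror the helper, details may differ):
```{r, cv_sketch, eval=FALSE}
# Hypothetical equivalent using sparklyr's built-in cross-validator.
cv <- ml_cross_validator(
  sc,
  estimator = glm_pipeline,
  estimator_param_maps = grid,
  evaluator = ml_multiclass_classification_evaluator(sc, metric_name = "accuracy"),
  num_folds = 4,
  seed = 2018
)
cv_model <- ml_fit(cv, train_data)
ml_validation_metrics(cv_model)
```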
### Train a tuned logistic regression model
Train a logistic regression model on the full training set using the best parameters found during cross-validated hyper-parameter tuning.
```{r, glm_final_model}
### Train a logistic regression model ###
glm_tuned_start_time <- Sys.time()
glm_model <- ml_logistic_regression(train_data, "delay ~ .",
elastic_net_param = glm_cv_best_result[which.max(glm_cv_best_result$accuracy),"elastic_net_param_1"],
reg_param = glm_cv_best_result[which.max(glm_cv_best_result$accuracy),"reg_param_1"])
glm_tuned_end_time <- Sys.time()
glm_tuned_train_time <- glm_tuned_end_time - glm_tuned_start_time
### Performance on train set
glm_tuned_result_train <- generate_metrics_classification(model = glm_model,
type = "train")
glm_tuned_result_train %>%
kbl() %>%
kable_minimal()
```
```{r}
thresholds <- seq(to=0,from=1,by=-0.01)
predictions <- ml_predict(glm_model,test_data) %>% select(probability_1,probability_0,delay)
predictions$model <- "GLM"
TPR_value <- list()
FPR_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(probability_1 >= item ~ "1",
probability_1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
TPR <- TP/(TP + FN)
FPR <- FP/(TN + FP)
TPR_value <- append(TPR_value,TPR)
FPR_value <- append(FPR_value,FPR)
}
glm_roc <- do.call(rbind, Map(data.frame, TPR_value=TPR_value, FPR_value=FPR_value))
glm_roc$model <- "GLM"
ggplot(glm_roc, aes(x = FPR_value, y = TPR_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) + geom_abline(lty="dashed")+
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Receiver Operating Characteristic curve") +
xlab("False Positive Rate (FPR)") +
ylab("True Positive Rate (TPR)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,face="bold"))
```
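The curve above is built manually by sweeping 101 thresholds. As a quick sketch, the area under it can be approximated from those same points with the trapezoidal rule:
```{r, glm_auc_trapezoid, eval=FALSE}
# Approximate the AUC of the manual ROC curve via the trapezoidal rule.
glm_roc_sorted <- glm_roc[order(glm_roc$FPR_value), ]
auc_glm <- sum(diff(glm_roc_sorted$FPR_value) *
                 (head(glm_roc_sorted$TPR_value, -1) +
                    tail(glm_roc_sorted$TPR_value, -1)) / 2)
auc_glm
```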
```{r}
precision_value <- list()
recall_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(probability_1 >= item ~ "1",
probability_1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
precision_tmp <- TP/(TP + FP)
recall_tmp <- TP/(TP + FN)
precision_value <- append(precision_value,precision_tmp)
recall_value <- append(recall_value,recall_tmp)
}
glm_pr <- do.call(rbind, Map(data.frame, precision_value=precision_value, recall_value=recall_value))
glm_pr$model <- "GLM"
ggplot(glm_pr, aes(x = recall_value, y = precision_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) +
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Precision Recall curve") +
xlab("Recall ((TP)/(TP+FP))") +
ylab("Precision (TPR)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,face="bold"))
```
Save the model so it can be reused later for predictions.
```{r, glm_save_model, message=FALSE}
ml_save(glm_model, "models/glm_model", overwrite = TRUE)
```
Make predictions on the test data and get model performance.
```{r, glm_metrics}
### Performance on test set
glm_tuned_result <- generate_metrics_classification(glm_model,
"test",
test_data)
glm_tuned_result %>%
kbl() %>%
kable_minimal()
```
```{r}
glm_confusion_matrix <- confusion_matrix_elements(glm_model,test_data,"spark")
glm_confusion_matrix$TP
glm_confusion_matrix$TN
glm_confusion_matrix$FP
glm_confusion_matrix$FN
```
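The four confusion-matrix counts determine the headline metrics directly; a small sketch (assuming `confusion_matrix_elements()` returns numeric `TP`, `TN`, `FP`, and `FN`):
```{r, glm_confusion_metrics, eval=FALSE}
# Derive accuracy, precision, recall, and F1 from the raw confusion counts.
with(glm_confusion_matrix, {
  accuracy  <- (TP + TN) / (TP + TN + FP + FN)
  precision <- TP / (TP + FP)
  recall    <- TP / (TP + FN)
  f1        <- 2 * precision * recall / (precision + recall)
  c(accuracy = accuracy, precision = precision, recall = recall, f1 = f1)
})
```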
## Random forest model
In this section we will build a random forest pipeline, then cross-validate and hyper-parameter tune the model.
### Building a random forest pipeline
```{r, rf_model_cv}
# Pipeline
rf_pipeline <- ml_pipeline(sc) %>%
ft_r_formula(delay ~ .) %>%
ml_random_forest_classifier()
# Grid
grid <- list(random_forest = list(max_depth = c(1,3,5,7,10), num_trees = c(1,3,5,7,10,25,50)))
# Cross validate model
rf_cv <- cross_validator(sc = sc,
data = train_data,
pipeline = rf_pipeline,
grid = grid,
type = "classification",
folds = 4,
seed = 2018)
# Get model results
a <- rf_cv$all_results
rf_cv_best_result <- rf_cv$best_result
rf_cv_result <- rf_cv_best_result[which.max(rf_cv_best_result$accuracy),"accuracy"]
rf_train_time <- rf_cv$train_time
a[order(-a$accuracy),] %>%
head(5) %>%
kbl() %>%
kable_minimal()
```
### Train a random forest model
Train a random forest model using the training set and get model performance on the training data.
```{r, rf_final_model}
### Train a random forest model ###
rf_tuned_start_time <- Sys.time()
rf_model <- ml_random_forest_classifier(train_data,
"delay ~ .",
num_trees = rf_cv_best_result[which.max(rf_cv_best_result$accuracy),"num_trees_1"],
max_depth = rf_cv_best_result[which.max(rf_cv_best_result$accuracy),"max_depth_1"])
rf_tuned_end_time <- Sys.time()
rf_tuned_train_time <- rf_tuned_end_time - rf_tuned_start_time
### Performance on train set
rf_tuned_result_train <- generate_metrics_classification(rf_model,"train")
rf_tuned_result_train %>%
kbl() %>%
kable_minimal()
```
```{r}
thresholds <- seq(to=0,from=1,by=-0.01)
predictions <- ml_predict(rf_model,test_data) %>% select(probability_1,probability_0,delay)
predictions$model <- "RF"
TPR_value <- list()
FPR_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(probability_1 >= item ~ "1",
probability_1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
TPR <- TP/(TP + FN)
FPR <- FP/(TN + FP)
TPR_value <- append(TPR_value,TPR)
FPR_value <- append(FPR_value,FPR)
}
rf_roc <- do.call(rbind, Map(data.frame, TPR_value=TPR_value, FPR_value=FPR_value))
rf_roc$model <- "RF"
ggplot(rf_roc, aes(x = FPR_value, y = TPR_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) + geom_abline(lty="dashed")+
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Receiver Operating Characteristic curve") +
xlab("False Positive Rate (FPR)") +
ylab("True Positive Rate (TPR)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,face="bold"))
```
```{r}
precision_value <- list()
recall_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(probability_1 >= item ~ "1",
probability_1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
precision_tmp <- TP/(TP + FP)
recall_tmp <- TP/(TP + FN)
precision_value <- append(precision_value,precision_tmp)
recall_value <- append(recall_value,recall_tmp)
}
rf_pr <- do.call(rbind, Map(data.frame, precision_value=precision_value, recall_value=recall_value))
rf_pr$model <- "RF"
ggplot(rf_pr, aes(x = recall_value, y = precision_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) +
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Precision Recall curve") +
xlab("Recall ((TP)/(TP+FP))") +
ylab("Precision (TPR)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,face="bold"))
```
Save the model so it can be reused later for predictions.
```{r, rf_model_save, message=FALSE}
ml_save(rf_model, "models/rf_model_classification", overwrite = TRUE)
```
Make predictions on the test data and get model performance.
```{r, rf_model_metrics}
### Performance on test set
rf_tuned_result <- generate_metrics_classification(rf_model,
"test",
test_data)
rf_tuned_result %>%
kbl() %>%
kable_minimal()
```
```{r}
rf_confusion_matrix <- confusion_matrix_elements(rf_model,test_data,"spark")
rf_confusion_matrix$TP
rf_confusion_matrix$TN
rf_confusion_matrix$FP
rf_confusion_matrix$FN
```
```{r}
rm(train_data,
test_data)
gc()
gc()
h2o:::.h2o.garbageCollect()
h2o:::.h2o.garbageCollect()
h2o:::.h2o.garbageCollect()
```
## Training a gradient boosting machine with h2o
In this section we will train a gradient boosting machine (GBM) model using the h2o framework.
Define the response and predictor variables to use in model training, cross-validation, and hyper-parameter tuning.
```{r, h2o_set_x_y, message=FALSE, fig.show='hide', results='hide'}
y <- "delay"
x <- setdiff(names(train_data_h2o), y)
```
### Hyper-parameter tuning the GBM model
Perform hyper-parameter tuning to find the best value for tree depth.
```{r, gbm_tune1, message=FALSE, fig.show='hide', results='hide'}
### Define the range of depth to tune for
hyper_params = list( max_depth = seq(1,29,2) )
grid <- h2o.grid(
## hyper parameters
hyper_params = hyper_params,
## full Cartesian hyper-parameter search
search_criteria = list(strategy = "Cartesian"),
## which algorithm to run
algorithm="gbm",
## identifier for the grid, to later retrieve it
grid_id="depth_grid",
## standard model parameters
x = x,
y = y,
training_frame = train_data_h2o,
validation_frame = valid_data_h2o,
## more trees is better if the learning rate is small enough
## here, use "more than enough" trees - we have early stopping
ntrees = 10000,
## smaller learning rate is better
## since we have learning_rate_annealing, we can afford to start with a bigger learning rate
learn_rate = 0.05,
## learning rate annealing: learning_rate shrinks by 1% after every tree
## (use 1.00 to disable, but then lower the learning_rate)
learn_rate_annealing = 0.99,
## sample 80% of rows per tree
sample_rate = 0.8,
## sample 80% of columns per split
col_sample_rate = 0.8,
## fix a random number generator seed for reproducibility
seed = 1234,
## early stopping once the validation AUC doesn't improve by at least 1% for 5 consecutive scoring events
stopping_rounds = 5,
stopping_tolerance = 1e-2,
stopping_metric = "AUC",
## score every 10 trees to make early stopping reproducible (it depends on the scoring interval)
score_tree_interval = 10,
categorical_encoding = "auto",
max_runtime_secs = 14400
)
### Get the grid and sort decreasing by AUC
sortedGrid <- h2o.getGrid("depth_grid", sort_by="auc", decreasing = TRUE)
topDepths = sortedGrid@summary_table$max_depth[1:5]
minDepth = min(as.numeric(topDepths))
maxDepth = max(as.numeric(topDepths))
```
Tune the remaining hyper-parameters using random search.
```{r, gbm_tune2, message=FALSE, fig.show='hide', results='hide'}
hyper_params = list(
## restrict the search to the range of max_depth established above
max_depth = seq(minDepth,maxDepth,1),
## search a large space of row sampling rates per tree
sample_rate = seq(0.2,1,0.01),
## search a large space of column sampling rates per split
col_sample_rate = seq(0.2,1,0.01),
## search a large space of column sampling rates per tree
col_sample_rate_per_tree = seq(0.2,1,0.01),
## search a large space of how column sampling per split should change as a function of the depth of the split
col_sample_rate_change_per_level = seq(0.9,1.1,0.01),
## search a large space of the number of min rows in a terminal node
min_rows = 2^seq(0,log2(nrow(train_data_h2o))-1,1),
## search a large space of the number of bins for split-finding for continuous and integer columns
nbins = 2^seq(4,10,1),
## search a large space of the number of bins for split-finding for categorical columns
nbins_cats = 2^seq(4,12,1),
## search a few minimum required relative error improvement thresholds for a split to happen
min_split_improvement = c(0,1e-8,1e-6,1e-4),
## try all histogram types (QuantilesGlobal and RoundRobin are good for numeric columns with outliers)
histogram_type = c("UniformAdaptive","QuantilesGlobal","RoundRobin")
)
search_criteria = list(
## Random grid search
strategy = "RandomDiscrete",
## limit the runtime to 4 hours (14400 seconds)
max_runtime_secs = 14400,
## build no more than 100 models
max_models = 100,
## random number generator seed to make sampling of parameter combinations reproducible
seed = 1234,
## early stopping once the leaderboard of the top 5 models is converged to 0.1% relative difference
stopping_rounds = 5,
stopping_metric = "AUC",
stopping_tolerance = 1e-3
)
grid <- h2o.grid(
## hyper parameters
hyper_params = hyper_params,
## hyper-parameter search configuration (see above)
search_criteria = search_criteria,
## which algorithm to run
algorithm = "gbm",
## identifier for the grid, to later retrieve it
grid_id = "final_grid",
## standard model parameters
x = x,
y = y,
training_frame = train_data_h2o,
validation_frame = valid_data_h2o,
## more trees is better if the learning rate is small enough
## use "more than enough" trees - we have early stopping
ntrees = 10000,
## smaller learning rate is better
## since we have learning_rate_annealing, we can afford to start with a bigger learning rate
learn_rate = 0.05,
## learning rate annealing: learning_rate shrinks by 1% after every tree
## (use 1.00 to disable, but then lower the learning_rate)
learn_rate_annealing = 0.99,
## early stopping based on timeout (no model should take more than 10 hours - modify as needed)
max_runtime_secs = 36000,
## early stopping once the validation AUC doesn't improve by at least 1% for 5 consecutive scoring events
stopping_rounds = 5, stopping_tolerance = 1e-2, stopping_metric = "AUC",
## score every 10 trees to make early stopping reproducible (it depends on the scoring interval)
score_tree_interval = 10,
## base random number generator seed for each model (automatically gets incremented internally for each model)
seed = 1234,
categorical_encoding = "auto"
)
## Sort the grid models by AUC and select the best performing model
sortedGrid <- h2o.getGrid("final_grid", sort_by = "auc", decreasing = TRUE)
gbm <- h2o.getModel(sortedGrid@model_ids[[1]])
```
```{r}
sortedGrid
```
Show the metrics of the model together with their corresponding thresholds.
```{r, gbm_metrics}
gbm@model$validation_metrics@metrics$max_criteria_and_metric_scores[c(1,4:7),] %>%
mutate(threshold = round(threshold,3),
value = round(value,3)) %>%
select(metric,threshold,value) %>%
kbl() %>%
kable_minimal()
```
### Train the tuned model using the whole training set
Build a model on the whole training set:
```{r, gbm_final_model, message=FALSE, fig.show='hide', results='hide'}
gbm_tuned_start_time <- Sys.time()
final_gbm_model <- do.call(h2o.gbm,
## update parameters in place
{
p <- gbm@parameters
p$model_id = NULL ## do not overwrite the original grid model
p$training_frame = h2o.rbind(train_data_h2o, valid_data_h2o) ## use the full dataset
p$validation_frame = NULL ## no validation frame
p$nfolds = 4 ## cross-validation
p$max_runtime_secs = 36000
p
}
)
gbm_tuned_end_time <- Sys.time()
gbm_tuned_train_time <- gbm_tuned_end_time - gbm_tuned_start_time
```
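As a quick check of the retrained model, h2o's standard performance accessors can score it against the held-out test frame (a sketch; the formal metrics tables follow below):
```{r, gbm_test_perf_sketch, eval=FALSE}
# Score the retrained GBM on the test frame and pull out the AUC.
gbm_test_perf <- h2o.performance(final_gbm_model, newdata = test_data_h2o)
h2o.auc(gbm_test_perf)
```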
```{r}
thresholds <- seq(to=0,from=1,by=-0.01)
predictions <- as.data.frame(h2o.predict(final_gbm_model,test_data_h2o)) %>% select(p0,p1)
predictions$delay <- as.vector(test_data_h2o$delay)
predictions$model <- "GBM"
predictions <- copy_to(dest = sc,
df = predictions,
overwrite = TRUE)
TPR_value <- list()
FPR_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(p1 >= item ~ "1",
p1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
TPR <- TP/(TP + FN)
FPR <- FP/(TN + FP)
TPR_value <- append(TPR_value,TPR)
FPR_value <- append(FPR_value,FPR)
}
gbm_roc <- do.call(rbind, Map(data.frame, TPR_value=TPR_value, FPR_value=FPR_value))
gbm_roc$model <- "GBM"
ggplot(gbm_roc, aes(x = FPR_value, y = TPR_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) + geom_abline(lty="dashed")+
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Receiver Operating Characteristic curve") +
xlab("False Positive Rate (FPR)") +
ylab("True Positive Rate (TPR)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,face="bold"))
```
```{r}
precision_value <- list()
recall_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(p1 >= item ~ "1",
p1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
precision_tmp <- TP/(TP + FP)
recall_tmp <- TP/(TP + FN)
precision_value <- append(precision_value,precision_tmp)
recall_value <- append(recall_value,recall_tmp)
}
gbm_pr <- do.call(rbind, Map(data.frame, precision_value=precision_value, recall_value=recall_value))
gbm_pr$model <- "GBM"
ggplot(gbm_pr, aes(x = recall_value, y = precision_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) +
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Precision Recall curve") +
xlab("Recall ((TP)/(TP+FP))") +
ylab("Precision (TPR)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5,face="bold"))
```
Model metrics of the final tuned model on the training set:
```{r, gbm_tuned_metrics_train}
gbm_metrics_final_train <- generate_metrics_classification(final_gbm_model,
"h2o_classification_train")
gbm_metrics_final_train %>%
kbl() %>%
kable_minimal()
```
Model metrics of the final tuned model on the test set:
```{r, gbm_tuned_metrics_test}
gbm_metrics_final_test <- generate_metrics_classification(model = final_gbm_model,
type = "h2o_classification_test",
test_data = test_data_h2o,
sc = sc)
gbm_metrics_final_test %>%
kbl() %>%
kable_minimal()
```
```{r}
gbm_confusion_matrix <- confusion_matrix_elements(final_gbm_model,test_data_h2o,"h2o")
gbm_confusion_matrix$TP
gbm_confusion_matrix$TN
gbm_confusion_matrix$FP
gbm_confusion_matrix$FN
```
Save the model:
```{r, gbm_save_model, message=FALSE, fig.show='hide', results='hide'}
h2o.saveModel(gbm, "models/h2o/gbm_classification.csv", force=TRUE)
```
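`h2o.saveModel()` returns the full path of the saved model, which can later be passed to `h2o.loadModel()` to restore it (a sketch, not evaluated):
```{r, gbm_load_sketch, eval=FALSE}
# h2o.saveModel() returns the path on disk; h2o.loadModel() restores it.
saved_path <- h2o.saveModel(gbm, "models/h2o/gbm_classification.csv", force = TRUE)
gbm_reloaded <- h2o.loadModel(saved_path)
```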
## Train a deep learning model using the h2o framework
In this section we will train, hyper-parameter tune, and cross-validate a deep learning model.
### Hyper-parameter tuning the deep learning model
Hyper-parameter tuning with grid search:
```{r, dl_tune, message=FALSE, fig.show='hide', results='hide'}
hyper_params <- list(
hidden=list(c(32,32,32),c(64,64),c(100,100,100)),
input_dropout_ratio=c(0,0.05,0.15),
rate=c(0.01,0.02,1e-3,1e-4),
rate_annealing=c(1e-8,1e-7,1e-6),
activation=c("RectifierWithDropout","TanhWithDropout")
)
grid <- h2o.grid(
algorithm="deeplearning",
grid_id="dl_grid",
training_frame=train_data_h2o,
validation_frame=valid_data_h2o,
x=x,
y=y,
epochs=50,
### stop when misclassification does not improve by >=1% for 2 scoring events
stopping_metric="misclassification",
stopping_tolerance=1e-2,
stopping_rounds=2,
### downsample validation set for faster scoring
# score_validation_samples=10000,
### don't score more than 2.5% of the wall time
score_duty_cycle=0.025,
### Settings for manual or adaptive learning
adaptive_rate=F,
momentum_start=0.5,
momentum_stable=0.9,
momentum_ramp=1e7,
l1=1e-5,
l2=1e-5,
### can help improve stability for Rectifier
max_w2=10,
hyper_params=hyper_params,
categorical_encoding = "auto",
max_runtime_secs = 36000
)
## Sort the grid models by AUC and select the best performing model
sortedGrid <- h2o.getGrid("dl_grid", sort_by = "auc", decreasing = TRUE)
best_dl_model <- h2o.getModel(sortedGrid@model_ids[[1]])
```
```{r}
sortedGrid
```
Show the metrics of the model together with their corresponding thresholds.
```{r, dl_metrics}
best_dl_model@model$validation_metrics@metrics$max_criteria_and_metric_scores[c(1,4:7),] %>%
mutate(threshold = round(threshold,3),
value = round(value,3)) %>%
select(metric,threshold,value) %>%
kbl() %>%
kable_minimal()
```
### Train a tuned deep learning model using the entire training set
Build a model on the whole training set:
```{r, dl_final_model, message=FALSE, fig.show='hide', results='hide'}
dl_tuned_start_time <- Sys.time()
final_dl_model <- do.call(h2o.deeplearning,
## update parameters in place
{
p <- best_dl_model@parameters
p$model_id = NULL ## do not overwrite the original grid model
p$training_frame = h2o.rbind(train_data_h2o, valid_data_h2o) ## use the full dataset
p$validation_frame = NULL ## no validation frame
p$nfolds = 4 ## cross-validation
p
}
)
dl_tuned_end_time <- Sys.time()
dl_tuned_train_time <- dl_tuned_end_time - dl_tuned_start_time
```
```{r}
thresholds <- seq(to=0,from=1,by=-0.01)
predictions <- as.data.frame(h2o.predict(final_dl_model,test_data_h2o)) %>% select(p0,p1)
predictions$delay <- as.vector(test_data_h2o$delay)
predictions$model <- "DL"
predictions <- copy_to(dest = sc,
df = predictions,
overwrite = TRUE)
TPR_value <- list()
FPR_value <- list()
for (item in thresholds) {
threshold_100 <- predictions %>%
mutate(prediction = case_when(p1 >= item ~ "1",
p1 < item ~ "0"))
TP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "1"))
TN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "0"))
FN <- sdf_nrow(threshold_100 %>% filter(prediction == "0", delay == "1"))
FP <- sdf_nrow(threshold_100 %>% filter(prediction == "1", delay == "0"))
TPR <- TP/(TP + FN)
FPR <- FP/(TN + FP)
TPR_value <- append(TPR_value,TPR)
FPR_value <- append(FPR_value,FPR)
}
dl_roc <- do.call(rbind, Map(data.frame, TPR_value=TPR_value, FPR_value=FPR_value))
dl_roc$model <- "DL"
ggplot(dl_roc, aes(x = FPR_value, y = TPR_value)) +
geom_line(colour = "#0000ff",linetype = "longdash", linewidth=1) + geom_abline(lty="dashed")+
scale_x_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0,1)) +
scale_y_continuous(breaks=c(0,0.2,0.4,0.6,0.8,1),limits = c(0, 1)) +
labs(title = "Receiver Operating Characteristic curve") +
xlab("False Positive Rate (FPR)") +