From 96bfc226628a5f0e58c52d2f0e4f76b76e3bf2e9 Mon Sep 17 00:00:00 2001 From: Brian Hoang Date: Tue, 18 Jun 2019 21:04:30 -0700 Subject: [PATCH] Fix: Upgrading to R Batch SDK to 2018-12-01.8.0 (#354) * Added resource files * Added resource files * Removed comments * Fixed resource files documentation * Added check on job state * Fixed jobState --- R/batch-api.R | 27 ++++++++-------- R/cluster.R | 2 +- R/doAzureParallel.R | 32 ++++++++++++++----- R/utility-job.R | 23 ++++--------- docs/71-distributing-data.md | 8 ++--- .../resource_files/resource_files_example.R | 12 +++---- .../sas_resources_files_example.R | 4 +-- 7 files changed, 57 insertions(+), 51 deletions(-) diff --git a/R/batch-api.R b/R/batch-api.R index 235b4663..7d7f8de3 100644 --- a/R/batch-api.R +++ b/R/batch-api.R @@ -19,7 +19,7 @@ BatchUtilities <- R6::R6Class( accountName <- storageClient$authentication$name - resourceFiles <- NULL + resourceFiles <- args$resourceFiles if (!is.null(argsList)) { envFile <- paste0(taskId, ".rds") saveRDS(argsList, file = envFile) @@ -37,8 +37,18 @@ BatchUtilities <- R6::R6Class( envFile, readToken, config$endpointSuffix) - resourceFiles <- - list(rAzureBatch::createResourceFile(url = envFileUrl, fileName = envFile)) + + environmentResourceFile <- + rAzureBatch::createResourceFile(filePath = envFile, httpUrl = envFileUrl) + + if (is.null(resourceFiles)) + { + resourceFiles <- + list(environmentResourceFile) + } + else { + resourceFiles <- append(resourceFiles, environmentResourceFile) + } } # Only use the download command if cloudCombine is enabled @@ -52,17 +62,6 @@ BatchUtilities <- R6::R6Class( if (!is.null(cloudCombine)) { assign("cloudCombine", cloudCombine, .doAzureBatchGlobals) - containerSettings$imageName <- "brianlovedocker/doazureparallel-merge-dockerfile:0.12.1" - - copyCommand <- sprintf( - "%s %s %s --download --saskey $BLOBXFER_SASKEY --remoteresource . 
--include results/*.rds --endpoint %s", - accountName, - jobId, - "$AZ_BATCH_TASK_WORKING_DIR", - config$endpointSuffix - ) - - commands <- c(paste("blobxfer", copyCommand)) } exitConditions <- NULL diff --git a/R/cluster.R b/R/cluster.R index 0651d770..3536fad5 100644 --- a/R/cluster.R +++ b/R/cluster.R @@ -123,7 +123,7 @@ makeCluster <- # install docker containerConfiguration <- list( - type = "docker" + type = "dockerCompatible" ) dockerImage <- "rocker/tidyverse:latest" diff --git a/R/doAzureParallel.R b/R/doAzureParallel.R index 4302b76e..aedf7da4 100644 --- a/R/doAzureParallel.R +++ b/R/doAzureParallel.R @@ -474,12 +474,12 @@ setHttpTraffic <- function(value = FALSE) { storageEndpointSuffix = config$endpointSuffix) requiredJobResourceFiles <- list( - rAzureBatch::createResourceFile(url = workerScriptUrl, fileName = "worker.R"), - rAzureBatch::createResourceFile(url = mergerScriptUrl, fileName = "merger.R"), - rAzureBatch::createResourceFile(url = installGithubScriptUrl, fileName = "install_github.R"), - rAzureBatch::createResourceFile(url = installCranScriptUrl, fileName = "install_cran.R"), - rAzureBatch::createResourceFile(url = installBioConductorScriptUrl, fileName = "install_bioconductor.R"), - rAzureBatch::createResourceFile(url = jobCommonFileUrl, fileName = jobFileName) + rAzureBatch::createResourceFile(filePath = "worker.R", httpUrl = workerScriptUrl), + rAzureBatch::createResourceFile(filePath = "merger.R", httpUrl = mergerScriptUrl), + rAzureBatch::createResourceFile(filePath = "install_github.R", httpUrl = installGithubScriptUrl), + rAzureBatch::createResourceFile(filePath = "install_cran.R", httpUrl = installCranScriptUrl), + rAzureBatch::createResourceFile(filePath = "install_bioconductor.R", httpUrl = installBioConductorScriptUrl), + rAzureBatch::createResourceFile(filePath = jobFileName, httpUrl = jobCommonFileUrl) ) resourceFiles <- @@ -669,6 +669,21 @@ setHttpTraffic <- function(value = FALSE) { ) ) + mergeReadSasToken <- storageClient$generateSasToken("rl", "c", id) + mergeResourceFileUrl <- + rAzureBatch::createBlobUrl( + storageAccount = storageClient$authentication$name, + containerName = id, + sasToken = mergeReadSasToken, + storageEndpointSuffix = config$endpointSuffix + ) + + mergeResources <- + list( + rAzureBatch::createResourceFile( + storageContainerUrl = mergeResourceFileUrl, + blobPrefix = "results")) + BatchUtilitiesOperations$addTask( jobId = id, taskId = "merge", @@ -684,7 +699,8 @@ setHttpTraffic <- function(value = FALSE) { dependsOn = taskDependencies, cloudCombine = cloudCombine, outputFiles = append(obj$options$azure$outputFiles, mergeOutput), - containerImage = data$containerImage + containerImage = data$containerImage, + resourceFiles = mergeResources ) cat(". . .") @@ -726,7 +742,7 @@ setHttpTraffic <- function(value = FALSE) { } if (!identical(function(a, ...) 
c(a, list(...)), - obj$combineInfo$fun, ignore.environment = TRUE)){ + obj$combineInfo$fun, ignore.environment = TRUE)) { tryCatch({ accumulator <- foreach::makeAccum(it) accumulator(results, as.numeric(names(results))) diff --git a/R/utility-job.R b/R/utility-job.R index dfe0398d..17886326 100644 --- a/R/utility-job.R +++ b/R/utility-job.R @@ -472,19 +472,14 @@ waitForTasksToComplete <- flush.console() - validationFlag <- - (taskCounts$validationStatus == "Validated" && - totalTasks <= 200000) || - totalTasks > 200000 - if (taskCounts$failed > 0 && - errorHandling == "stop" && - validationFlag) { + errorHandling == "stop") { cat("\n") select <- "id, executionInfo" + filter <- "executionInfo/result eq 'failure'" failedTasks <- - batchClient$taskOperations$list(jobId, select = select) + batchClient$taskOperations$list(jobId, select = select, filter = filter) tasksFailureWarningLabel <- sprintf( @@ -498,14 +493,9 @@ waitForTasksToComplete <- ) for (i in 1:length(failedTasks$value)) { - if (!is.null(failedTasks$value[[i]]$executionInfo$result) && - grepl(failedTasks$value[[i]]$executionInfo$result, - "failure", - ignore.case = TRUE)) { tasksFailureWarningLabel <- paste0(tasksFailureWarningLabel, sprintf("%s\n", failedTasks$value[[i]]$id)) - } } warning(sprintf(tasksFailureWarningLabel, @@ -533,9 +523,10 @@ waitForTasksToComplete <- jobId) } - if (taskCounts$completed >= totalTasks && - (taskCounts$validationStatus == "Validated" || - totalTasks >= 200000)) { + jobInfo <- getJob(jobId, verbose = FALSE) + if (taskCounts$completed >= totalTasks || + jobInfo$jobState == "completed" || + jobInfo$jobState == "terminating") { cat("\n") break } diff --git a/docs/71-distributing-data.md b/docs/71-distributing-data.md index 3546c6bd..081c2c88 100644 --- a/docs/71-distributing-data.md +++ b/docs/71-distributing-data.md @@ -39,12 +39,12 @@ Here's an example that uses data stored in a public location on Azure Blob Stora # define where to download data from resource_files = list( rAzureBatch::createResourceFile( - url = "https://.blob.core.windows.net//2010.csv", - fileName = "2010.csv" + httpUrl = "https://.blob.core.windows.net//2010.csv", + filePath = "2010.csv" ), rAzureBatch::createResourceFile( - url = "https://.blob.core.windows.net//2011.csv", - fileName = "2011.csv" + httpUrl = "https://.blob.core.windows.net//2011.csv", + filePath = "2011.csv" ) ) diff --git a/samples/resource_files/resource_files_example.R b/samples/resource_files/resource_files_example.R index 37fa6ac9..b3544c81 100644 --- a/samples/resource_files/resource_files_example.R +++ b/samples/resource_files/resource_files_example.R @@ -34,12 +34,12 @@ doAzureParallel::setCredentials("credentials.json") # Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset" resource_files <- list( - rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), fileName = "yellow_tripdata_2016-1.csv"), - rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), fileName = "yellow_tripdata_2016-2.csv"), - rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), fileName = "yellow_tripdata_2016-3.csv"), - rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), fileName = "yellow_tripdata_2016-4.csv"), - rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), 
fileName = "yellow_tripdata_2016-5.csv"), - rAzureBatch::createResourceFile(url = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), fileName = "yellow_tripdata_2016-6.csv") + rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), filePath = "yellow_tripdata_2016-1.csv"), + rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), filePath = "yellow_tripdata_2016-2.csv"), + rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), filePath = "yellow_tripdata_2016-3.csv"), + rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), filePath = "yellow_tripdata_2016-4.csv"), + rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), filePath = "yellow_tripdata_2016-5.csv"), + rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), filePath = "yellow_tripdata_2016-6.csv") ) # add the parameter 'resourceFiles' to download files to nodes diff --git a/samples/sas_resource_files/sas_resources_files_example.R b/samples/sas_resource_files/sas_resources_files_example.R index adf3da23..37ee93a0 100644 --- a/samples/sas_resource_files/sas_resources_files_example.R +++ b/samples/sas_resource_files/sas_resources_files_example.R @@ -56,8 +56,8 @@ csvFileUrl2 <- rAzureBatch::createBlobUrl(storageAccount = storageAccountName, # Create a list of files to download to the cluster using read-only permissions # Place the files in a directory called 'data' resource_files = list( - rAzureBatch::createResourceFile(url = csvFileUrl1, fileName = "data/1989.csv"), - rAzureBatch::createResourceFile(url = csvFileUrl2, fileName = "data/1990.csv") + rAzureBatch::createResourceFile(httpUrl = csvFileUrl1, filePath = "data/1989.csv"), + rAzureBatch::createResourceFile(httpUrl = csvFileUrl2, filePath = "data/1990.csv") ) # Create the cluster