Jitghosh/parallelbatchscore (#289)

* Batch Scoring First Draft: Added pipeline creation and run scripts, scoring script, new environment variables, changes to env loading script, compute creation and AML environment creation scripts, and new Azure pipeline for batch scoring CI
* Score copy step added
* Modified bootstrap.py, updated getting started doc
* Addressed PR comments
* Addressed PR comments
* Doc fix
* Doc fix
microsoft · Jun 22, 2020 · bcdac5c · bcdac5c
1 parent 9056285
commit bcdac5c
Show file tree

Hide file tree

Showing 19 changed files with 1,311 additions and 95 deletions.
diff --git a/.env.example b/.env.example
@@ -1,30 +1,33 @@
 # Azure Subscription Variables
 SUBSCRIPTION_ID = ''
-LOCATION = 'westeurope'
+LOCATION = ''
 TENANT_ID = ''
 BASE_NAME = ''
 SP_APP_ID = ''
 SP_APP_SECRET = ''
-RESOUCE_GROUP = 'mlops-rg'
+RESOURCE_GROUP = 'mlops-RG'
 
 # Mock build/release ID for local testing
 BUILD_BUILDID = '001'
 
 # Azure ML Workspace Variables
-WORKSPACE_NAME = 'aml-workspace'
-EXPERIMENT_NAME = ''
+WORKSPACE_NAME = 'mlops-aml-ws'
+EXPERIMENT_NAME = 'mlopspython'
 
 # AML Compute Cluster Config
 AML_ENV_NAME='diabetes_regression_training_env'
+AML_ENV_TRAIN_CONDA_DEP_FILE="conda_dependencies.yml"
 AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
 AML_COMPUTE_CLUSTER_CPU_SKU = 'STANDARD_DS2_V2'
 AML_CLUSTER_MAX_NODES = '4'
 AML_CLUSTER_MIN_NODES = '0'
 AML_CLUSTER_PRIORITY = 'lowpriority'
 # Training Config
-MODEL_NAME = 'sklearn_regression_model.pkl'
+MODEL_NAME = 'diabetes_regression_model.pkl'
 MODEL_VERSION = '1'
 TRAIN_SCRIPT_PATH = 'training/train.py'
+
+
 # AML Pipeline Config
 TRAINING_PIPELINE_NAME = 'Training Pipeline'
 MODEL_PATH = ''
@@ -51,3 +54,28 @@ ALLOW_RUN_CANCEL = 'true'
 
 # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
 AML_REBUILD_ENVIRONMENT = 'false'
+
+
+
+USE_GPU_FOR_SCORING = "false"
+AML_ENV_SCORE_CONDA_DEP_FILE="conda_dependencies_scoring.yml"
+AML_ENV_SCORECOPY_CONDA_DEP_FILE="conda_dependencies_scorecopy.yml"
+# AML Compute Cluster Config for parallel batch scoring
+AML_ENV_NAME_SCORING='diabetes_regression_scoring_env'
+AML_ENV_NAME_SCORE_COPY='diabetes_regression_score_copy_env'
+AML_COMPUTE_CLUSTER_NAME_SCORING = 'score-cluster'
+AML_COMPUTE_CLUSTER_CPU_SKU_SCORING = 'STANDARD_DS2_V2'
+AML_CLUSTER_MAX_NODES_SCORING = '4'
+AML_CLUSTER_MIN_NODES_SCORING = '0'
+AML_CLUSTER_PRIORITY_SCORING = 'lowpriority'
+AML_REBUILD_ENVIRONMENT_SCORING = 'true'
+BATCHSCORE_SCRIPT_PATH = 'scoring/parallel_batchscore.py'
+BATCHSCORE_COPY_SCRIPT_PATH = 'scoring/parallel_batchscore_copyoutput.py'
+
+
+SCORING_DATASTORE_INPUT_CONTAINER = 'input'
+SCORING_DATASTORE_INPUT_FILENAME = 'diabetes_scoring_input.csv'
+SCORING_DATASTORE_OUTPUT_CONTAINER = 'output'
+SCORING_DATASTORE_OUTPUT_FILENAME = 'diabetes_scoring_output.csv'
+SCORING_DATASET_NAME = 'diabetes_scoring_ds'
+SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline'
diff --git a/.pipelines/diabetes_regression-batchscoring-ci.yml b/.pipelines/diabetes_regression-batchscoring-ci.yml
@@ -0,0 +1,64 @@
+# Continuous Integration (CI) pipeline that orchestrates the batch scoring of the diabetes_regression model.
+
+resources:
+  containers:
+  - container: mlops
+    image: mcr.microsoft.com/mlops/python:latest
+
+
+pr: none
+trigger:
+  branches:
+    include:
+    - master
+  paths:
+    include:
+    - diabetes_regression/scoring/parallel_batchscore.py
+    - ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py
+    - ml_service/pipelines/run_parallel_batchscore_pipeline.py
+
+variables:
+- template: diabetes_regression-variables-template.yml
+- group: devopsforai-aml-vg
+
+pool:
+  vmImage: ubuntu-latest
+
+stages:
+- stage: 'Batch_Scoring_Pipeline_CI'
+  displayName: 'Batch Scoring Pipeline CI'
+  jobs:
+  - job: "Build_Batch_Scoring_Pipeline"
+    displayName: "Build Batch Scoring Pipeline"
+    container: mlops
+    timeoutInMinutes: 0
+    steps:
+    - template: code-quality-template.yml
+    - task: AzureCLI@1
+      name: publish_batchscore
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'        
+        scriptLocation: inlineScript
+        workingDirectory: $(Build.SourcesDirectory)
+        inlineScript: |
+          set -e # fail on error
+          export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
+          # Invoke the Python module that builds and publishes the batch scoring pipeline
+          python -m ml_service.pipelines.diabetes_regression_build_parallel_batchscore_pipeline
+ 
+  - job: "Run_Batch_Score_Pipeline"
+    displayName: "Run Batch Scoring Pipeline"
+    dependsOn: "Build_Batch_Scoring_Pipeline"
+    timeoutInMinutes: 240
+    pool: server
+    variables:
+      pipeline_id: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['publish_batchscore.pipeline_id']]
+    steps:
+    - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
+      displayName: 'Invoke Batch Scoring pipeline'
+      inputs:
+        azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
+        PipelineId: '$(pipeline_id)'
+        ExperimentName: '$(EXPERIMENT_NAME)'
+        PipelineParameters: '"ParameterAssignments": {"model_name": "$(MODEL_NAME)"}'
+
diff --git a/.pipelines/diabetes_regression-variables-template.yml b/.pipelines/diabetes_regression-variables-template.yml
@@ -16,6 +16,7 @@ variables:
     # The path to the model scoring script relative to SOURCES_DIR_TRAIN
   - name: SCORE_SCRIPT
     value: scoring/score.py
+
 
   # Azure ML Variables
   - name: EXPERIMENT_NAME
@@ -35,6 +36,8 @@ variables:
   # AML Compute Cluster Config
   - name: AML_ENV_NAME
     value: diabetes_regression_training_env
+  - name: AML_ENV_TRAIN_CONDA_DEP_FILE
+    value: "conda_dependencies.yml"
   - name: AML_COMPUTE_CLUSTER_CPU_SKU
     value: STANDARD_DS2_V2
   - name: AML_COMPUTE_CLUSTER_NAME
@@ -69,3 +72,62 @@ variables:
   # Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
   # - name: AML_REBUILD_ENVIRONMENT
   #  value: "false"
+
+  # Variables below are used for controlling various aspects of batch scoring
+  - name: USE_GPU_FOR_SCORING
+    value: False
+  # Conda dependencies for the batch scoring step
+  - name: AML_ENV_SCORE_CONDA_DEP_FILE
+    value: "conda_dependencies_scoring.yml"
+  # Conda dependencies for the score copying step
+  - name: AML_ENV_SCORECOPY_CONDA_DEP_FILE
+    value: "conda_dependencies_scorecopy.yml"
+  # AML Compute Cluster Config for parallel batch scoring
+  - name: AML_ENV_NAME_SCORING
+    value: diabetes_regression_scoring_env
+  - name: AML_ENV_NAME_SCORE_COPY
+    value: diabetes_regression_score_copy_env
+  - name: AML_COMPUTE_CLUSTER_CPU_SKU_SCORING
+    value: STANDARD_DS2_V2
+  - name: AML_COMPUTE_CLUSTER_NAME_SCORING
+    value: score-cluster
+  - name: AML_CLUSTER_MIN_NODES_SCORING
+    value: 0
+  - name: AML_CLUSTER_MAX_NODES_SCORING
+    value: 4
+  - name: AML_CLUSTER_PRIORITY_SCORING
+    value: lowpriority
+  # The path to the batch scoring script relative to SOURCES_DIR_TRAIN
+  - name: BATCHSCORE_SCRIPT_PATH
+    value: scoring/parallel_batchscore.py
+  - name: BATCHSCORE_COPY_SCRIPT_PATH
+    value: scoring/parallel_batchscore_copyoutput.py
+  # Flag to allow rebuilding the AML Environment after it was built for the first time. 
+  # This enables dependency updates from the conda dependencies yaml for scoring activities.
+  - name: AML_REBUILD_ENVIRONMENT_SCORING
+    value: "true"
+
+  # Datastore config for scoring
+  # The storage account name and key are supplied as variables in a variable group 
+  # in the Azure Pipelines library for this project. Please refer to repo docs for 
+  # more details
+
+  # Blob container where the input data for scoring can be found
+  - name: SCORING_DATASTORE_INPUT_CONTAINER
+    value: "input"
+  # Blobname for the input data - include any applicable path in the string 
+  - name: SCORING_DATASTORE_INPUT_FILENAME
+    value: "diabetes_scoring_input.csv"
+  # Blob container where the output data for scoring can be found
+  - name: SCORING_DATASTORE_OUTPUT_CONTAINER
+    value: "output"
+  # Blobname for the output data - include any applicable path in the string 
+  - name: SCORING_DATASTORE_OUTPUT_FILENAME
+    value: "diabetes_scoring_output.csv"
+  # Dataset name for input data for scoring
+  - name: SCORING_DATASET_NAME
+    value: "diabetes_scoring_ds"
+  # Scoring pipeline name
+  - name: SCORING_PIPELINE_NAME
+    value: "diabetes-scoring-pipeline"
+
diff --git a/bootstrap/bootstrap.py b/bootstrap/bootstrap.py
@@ -87,10 +87,12 @@ def replace_project_name(project_dir, project_name, rename_name):
             r".pipelines/diabetes_regression-ci.yml",
             r".pipelines/abtest.yml",
             r".pipelines/diabetes_regression-ci-image.yml",
+            r".pipelines/diabetes_regression-batchscoring-ci.yml",
             r".pipelines/diabetes_regression-get-model-version-template.yml",  # NOQA: E501
             r".pipelines/diabetes_regression-variables-template.yml",
             r"environment_setup/Dockerfile",
             r"environment_setup/install_requirements.sh",
+            r"ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py",  # NOQA: E501
             r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py",  # NOQA: E501
             r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py",  # NOQA: E501
             r"ml_service/pipelines/diabetes_regression_build_train_pipeline.py",  # NOQA: E501

diff --git a/diabetes_regression/conda_dependencies_scorecopy.yml b/diabetes_regression/conda_dependencies_scorecopy.yml
@@ -0,0 +1,31 @@
+# Conda environment specification. The dependencies defined in this file will
+# be automatically provisioned for managed runs. These include runs against
+# the localdocker, remotedocker, and cluster compute targets.
+
+# Note that this file is NOT used to automatically manage dependencies for the
+# local compute target. To provision these dependencies locally, run:
+# conda env update --file conda_dependencies_scorecopy.yml
+
+# Details about the Conda environment file format:
+# https://conda.io/docs/using/envs.html#create-environment-file-by-hand
+
+# For managing Spark packages and configuration, see spark_dependencies.yml.
+# Version of this configuration file's structure and semantics in AzureML.
+# This directive is stored in a comment to preserve the Conda file structure.
+# [AzureMlVersion] = 2
+
+# These dependencies are used to create the environment used by the batch score 
+# copy pipeline step
+name: diabetes_regression_score_copy_env
+dependencies:
+  # The python interpreter version.
+  # Currently Azure ML Workbench only supports 3.5.2 and later.
+  - python=3.7.*
+  - pip
+
+  - pip:
+      # Base AzureML SDK
+      - azureml-sdk==1.6.*
+
+      # Score copying deps
+      - azure-storage-blob
diff --git a/diabetes_regression/conda_dependencies_scoring.yml b/diabetes_regression/conda_dependencies_scoring.yml
@@ -0,0 +1,32 @@
+# Conda environment specification. The dependencies defined in this file will
+# be automatically provisioned for managed runs. These include runs against
+# the localdocker, remotedocker, and cluster compute targets.
+
+# Note that this file is NOT used to automatically manage dependencies for the
+# local compute target. To provision these dependencies locally, run:
+# conda env update --file conda_dependencies_scoring.yml
+
+# Details about the Conda environment file format:
+# https://conda.io/docs/using/envs.html#create-environment-file-by-hand
+
+# For managing Spark packages and configuration, see spark_dependencies.yml.
+# Version of this configuration file's structure and semantics in AzureML.
+# This directive is stored in a comment to preserve the Conda file structure.
+# [AzureMlVersion] = 2
+
+# These dependencies are used to create the environment used by the batch score 
+# pipeline step
+name: diabetes_regression_scoring_env
+dependencies:
+  # The python interpreter version.
+  # Currently Azure ML Workbench only supports 3.5.2 and later.
+  - python=3.7.*
+  - pip
+
+  - pip:
+      # Base AzureML SDK
+      - azureml-sdk==1.6.*
+
+      # Scoring deps
+      - scikit-learn
+      - pandas