Skip to content

Commit

Permalink
Jitghosh/parallelbatchscore (#289)
Browse files Browse the repository at this point in the history
* Batch Scoring First Draft: Added pipeline creation and run scripts, scoring script, new environment variables, changes to env loading script, compute creation and AML environment creation scripts, and new Azure pipeline for batch scoring CI

* Score copy step added

* Modified bootstrap.py, updated getting started doc

* Addressed PR comments

* Addressed PR comments

* Doc fix

* Doc fix
  • Loading branch information
jitghosh authored Jun 22, 2020
1 parent 9056285 commit bcdac5c
Show file tree
Hide file tree
Showing 19 changed files with 1,311 additions and 95 deletions.
38 changes: 33 additions & 5 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,30 +1,33 @@
# Azure Subscription Variables
SUBSCRIPTION_ID = ''
LOCATION = 'westeurope'
LOCATION = ''
TENANT_ID = ''
BASE_NAME = ''
SP_APP_ID = ''
SP_APP_SECRET = ''
RESOUCE_GROUP = 'mlops-rg'
RESOURCE_GROUP = 'mlops-RG'

# Mock build/release ID for local testing
BUILD_BUILDID = '001'

# Azure ML Workspace Variables
WORKSPACE_NAME = 'aml-workspace'
EXPERIMENT_NAME = ''
WORKSPACE_NAME = 'mlops-aml-ws'
EXPERIMENT_NAME = 'mlopspython'

# AML Compute Cluster Config
AML_ENV_NAME='diabetes_regression_training_env'
AML_ENV_TRAIN_CONDA_DEP_FILE="conda_dependencies.yml"
AML_COMPUTE_CLUSTER_NAME = 'train-cluster'
AML_COMPUTE_CLUSTER_CPU_SKU = 'STANDARD_DS2_V2'
AML_CLUSTER_MAX_NODES = '4'
AML_CLUSTER_MIN_NODES = '0'
AML_CLUSTER_PRIORITY = 'lowpriority'
# Training Config
MODEL_NAME = 'sklearn_regression_model.pkl'
MODEL_NAME = 'diabetes_regression_model.pkl'
MODEL_VERSION = '1'
TRAIN_SCRIPT_PATH = 'training/train.py'


# AML Pipeline Config
TRAINING_PIPELINE_NAME = 'Training Pipeline'
MODEL_PATH = ''
Expand All @@ -51,3 +54,28 @@ ALLOW_RUN_CANCEL = 'true'

# Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
AML_REBUILD_ENVIRONMENT = 'false'



USE_GPU_FOR_SCORING = "false"
AML_ENV_SCORE_CONDA_DEP_FILE="conda_dependencies_scoring.yml"
AML_ENV_SCORECOPY_CONDA_DEP_FILE="conda_dependencies_scorecopy.yml"
# AML Compute Cluster Config for parallel batch scoring
AML_ENV_NAME_SCORING='diabetes_regression_scoring_env'
AML_ENV_NAME_SCORE_COPY='diabetes_regression_score_copy_env'
AML_COMPUTE_CLUSTER_NAME_SCORING = 'score-cluster'
AML_COMPUTE_CLUSTER_CPU_SKU_SCORING = 'STANDARD_DS2_V2'
AML_CLUSTER_MAX_NODES_SCORING = '4'
AML_CLUSTER_MIN_NODES_SCORING = '0'
AML_CLUSTER_PRIORITY_SCORING = 'lowpriority'
AML_REBUILD_ENVIRONMENT_SCORING = 'true'
BATCHSCORE_SCRIPT_PATH = 'scoring/parallel_batchscore.py'
BATCHSCORE_COPY_SCRIPT_PATH = 'scoring/parallel_batchscore_copyoutput.py'


SCORING_DATASTORE_INPUT_CONTAINER = 'input'
SCORING_DATASTORE_INPUT_FILENAME = 'diabetes_scoring_input.csv'
SCORING_DATASTORE_OUTPUT_CONTAINER = 'output'
SCORING_DATASTORE_OUTPUT_FILENAME = 'diabetes_scoring_output.csv'
SCORING_DATASET_NAME = 'diabetes_scoring_ds'
SCORING_PIPELINE_NAME = 'diabetes-scoring-pipeline'
64 changes: 64 additions & 0 deletions .pipelines/diabetes_regression-batchscoring-ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Continuous Integration (CI) pipeline that builds, publishes, and then runs
# the batch scoring pipeline for the diabetes_regression model.

resources:
  containers:
    # Container image the build job runs in (referenced below as `mlops`).
    - container: mlops
      image: mcr.microsoft.com/mlops/python:latest

# No PR validation; run only on pushes to master that touch the batch
# scoring sources listed under `paths`.
pr: none
trigger:
  branches:
    include:
      - master
  paths:
    include:
      # NOTE(review): scoring/parallel_batchscore_copyoutput.py is not listed
      # here, so a change to the copy step alone will not trigger this
      # pipeline - confirm whether that is intended.
      - diabetes_regression/scoring/parallel_batchscore.py
      - ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py
      - ml_service/pipelines/run_parallel_batchscore_pipeline.py

variables:
  - template: diabetes_regression-variables-template.yml
  - group: devopsforai-aml-vg

pool:
  vmImage: ubuntu-latest

stages:
  - stage: 'Batch_Scoring_Pipeline_CI'
    displayName: 'Batch Scoring Pipeline CI'
    jobs:
      # Job 1: run code quality checks, then build and publish the batch
      # scoring pipeline from inside the `mlops` container.
      - job: "Build_Batch_Scoring_Pipeline"
        displayName: "Build Batch Scoring Pipeline"
        container: mlops
        # NOTE(review): 0 is understood to request the maximum timeout the
        # agent pool allows - confirm against the Azure Pipelines docs.
        timeoutInMinutes: 0
        steps:
          - template: code-quality-template.yml
          - task: AzureCLI@1
            # The step name is what the second job uses to address this
            # step's output variables (publish_batchscore.pipeline_id).
            name: publish_batchscore
            inputs:
              azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
              scriptLocation: inlineScript
              workingDirectory: $(Build.SourcesDirectory)
              inlineScript: |
                set -e # fail on error
                export SUBSCRIPTION_ID=$(az account show --query id -o tsv)
                # Invoke the Python module that builds and publishes the batch scoring pipeline
                python -m ml_service.pipelines.diabetes_regression_build_parallel_batchscore_pipeline

      # Job 2: invoke the published pipeline. Runs with `pool: server`,
      # i.e. without a build agent.
      - job: "Run_Batch_Score_Pipeline"
        displayName: "Run Batch Scoring Pipeline"
        dependsOn: "Build_Batch_Scoring_Pipeline"
        timeoutInMinutes: 240
        pool: server
        variables:
          # Output variable of the publish_batchscore step in the build job.
          # Presumably set by the Python module invoked there - verify.
          pipeline_id: $[ dependencies.Build_Batch_Scoring_Pipeline.outputs['publish_batchscore.pipeline_id']]
        steps:
          - task: ms-air-aiagility.vss-services-azureml.azureml-restApi-task.MLPublishedPipelineRestAPITask@0
            displayName: 'Invoke Batch Scoring pipeline'
            inputs:
              azureSubscription: '$(WORKSPACE_SVC_CONNECTION)'
              PipelineId: '$(pipeline_id)'
              ExperimentName: '$(EXPERIMENT_NAME)'
              PipelineParameters: '"ParameterAssignments": {"model_name": "$(MODEL_NAME)"}'

62 changes: 62 additions & 0 deletions .pipelines/diabetes_regression-variables-template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ variables:
# The path to the model scoring script relative to SOURCES_DIR_TRAIN
- name: SCORE_SCRIPT
value: scoring/score.py


# Azure ML Variables
- name: EXPERIMENT_NAME
Expand All @@ -35,6 +36,8 @@ variables:
# AML Compute Cluster Config
- name: AML_ENV_NAME
value: diabetes_regression_training_env
- name: AML_ENV_TRAIN_CONDA_DEP_FILE
value: "conda_dependencies.yml"
- name: AML_COMPUTE_CLUSTER_CPU_SKU
value: STANDARD_DS2_V2
- name: AML_COMPUTE_CLUSTER_NAME
Expand Down Expand Up @@ -69,3 +72,62 @@ variables:
# Flag to allow rebuilding the AML Environment after it was built for the first time. This enables dependency updates from conda_dependencies.yml.
# - name: AML_REBUILD_ENVIRONMENT
# value: "false"

# Variables below are used for controlling various aspects of batch scoring
# Quoted lowercase string, so the value is not parsed as a YAML boolean and
# matches the "false" used for this flag in .env.example.
- name: USE_GPU_FOR_SCORING
  value: "false"
# Conda dependencies for the batch scoring step
- name: AML_ENV_SCORE_CONDA_DEP_FILE
value: "conda_dependencies_scoring.yml"
# Conda dependencies for the score copying step
- name: AML_ENV_SCORECOPY_CONDA_DEP_FILE
value: "conda_dependencies_scorecopy.yml"
# AML Compute Cluster Config for parallel batch scoring
- name: AML_ENV_NAME_SCORING
value: diabetes_regression_scoring_env
- name: AML_ENV_NAME_SCORE_COPY
value: diabetes_regression_score_copy_env
- name: AML_COMPUTE_CLUSTER_CPU_SKU_SCORING
value: STANDARD_DS2_V2
- name: AML_COMPUTE_CLUSTER_NAME_SCORING
value: score-cluster
- name: AML_CLUSTER_MIN_NODES_SCORING
value: 0
- name: AML_CLUSTER_MAX_NODES_SCORING
value: 4
- name: AML_CLUSTER_PRIORITY_SCORING
value: lowpriority
# The path to the batch scoring script relative to SOURCES_DIR_TRAIN
- name: BATCHSCORE_SCRIPT_PATH
value: scoring/parallel_batchscore.py
- name: BATCHSCORE_COPY_SCRIPT_PATH
value: scoring/parallel_batchscore_copyoutput.py
# Flag to allow rebuilding the AML Environment after it was built for the first time.
# This enables dependency updates from the conda dependencies yaml for scoring activities.
- name: AML_REBUILD_ENVIRONMENT_SCORING
value: "true"

# Datastore config for scoring
# The storage account name and key are supplied as variables in a variable group
# in the Azure Pipelines library for this project. Please refer to repo docs for
# more details

# Blob container where the input data for scoring can be found
- name: SCORING_DATASTORE_INPUT_CONTAINER
value: "input"
# Blobname for the input data - include any applicable path in the string
- name: SCORING_DATASTORE_INPUT_FILENAME
value: "diabetes_scoring_input.csv"
# Blob container where the output data for scoring can be found
- name: SCORING_DATASTORE_OUTPUT_CONTAINER
value: "output"
# Blobname for the output data - include any applicable path in the string
- name: SCORING_DATASTORE_OUTPUT_FILENAME
value: "diabetes_scoring_output.csv"
# Dataset name for input data for scoring
- name: SCORING_DATASET_NAME
value: "diabetes_scoring_ds"
# Scoring pipeline name
- name: SCORING_PIPELINE_NAME
value: "diabetes-scoring-pipeline"

2 changes: 2 additions & 0 deletions bootstrap/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,12 @@ def replace_project_name(project_dir, project_name, rename_name):
r".pipelines/diabetes_regression-ci.yml",
r".pipelines/abtest.yml",
r".pipelines/diabetes_regression-ci-image.yml",
r".pipelines/diabetes_regression-batchscoring-ci.yml",
r".pipelines/diabetes_regression-get-model-version-template.yml", # NOQA: E501
r".pipelines/diabetes_regression-variables-template.yml",
r"environment_setup/Dockerfile",
r"environment_setup/install_requirements.sh",
r"ml_service/pipelines/diabetes_regression_build_parallel_batchscore_pipeline.py", # NOQA: E501
r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r_on_dbricks.py", # NOQA: E501
r"ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py", # NOQA: E501
r"ml_service/pipelines/diabetes_regression_build_train_pipeline.py", # NOQA: E501
Expand Down
31 changes: 31 additions & 0 deletions diabetes_regression/conda_dependencies_scorecopy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for managed runs. These include runs against
# the localdocker, remotedocker, and cluster compute targets.

# Note that this file is NOT used to automatically manage dependencies for the
# local compute target. To provision these dependencies locally, run:
# conda env update --file conda_dependencies_scorecopy.yml

# Details about the Conda environment file format:
# https://conda.io/docs/using/envs.html#create-environment-file-by-hand

# For managing Spark packages and configuration, see spark_dependencies.yml.
# Version of this configuration file's structure and semantics in AzureML.
# This directive is stored in a comment to preserve the Conda file structure.
# [AzureMlVersion] = 2

# These dependencies are used to create the environment used by the batch score
# copy pipeline step
name: diabetes_regression_score_copy_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML Workbench only supports 3.5.2 and later.
  - python=3.7.*
  - pip

  # Packages installed through pip inside the conda environment.
  - pip:
      # Base AzureML SDK
      - azureml-sdk==1.6.*

      # Score copying deps
      - azure-storage-blob
32 changes: 32 additions & 0 deletions diabetes_regression/conda_dependencies_scoring.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for managed runs. These include runs against
# the localdocker, remotedocker, and cluster compute targets.

# Note that this file is NOT used to automatically manage dependencies for the
# local compute target. To provision these dependencies locally, run:
# conda env update --file conda_dependencies_scoring.yml

# Details about the Conda environment file format:
# https://conda.io/docs/using/envs.html#create-environment-file-by-hand

# For managing Spark packages and configuration, see spark_dependencies.yml.
# Version of this configuration file's structure and semantics in AzureML.
# This directive is stored in a comment to preserve the Conda file structure.
# [AzureMlVersion] = 2

# These dependencies are used to create the environment used by the batch score
# pipeline step
name: diabetes_regression_scoring_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML Workbench only supports 3.5.2 and later.
  - python=3.7.*
  - pip

  # Packages installed through pip inside the conda environment.
  - pip:
      # Base AzureML SDK
      - azureml-sdk==1.6.*

      # Scoring deps
      - scikit-learn
      - pandas
Loading

0 comments on commit bcdac5c

Please sign in to comment.