diff --git a/experimentation/Diabetes Ridge Regression Scoring.ipynb b/experimentation/Diabetes Ridge Regression Scoring.ipynb
index 9ac340ed..a33432fe 100644
--- a/experimentation/Diabetes Ridge Regression Scoring.ipynb
+++ b/experimentation/Diabetes Ridge Regression Scoring.ipynb
@@ -1,4 +1,4 @@
-{
+ {
"cells": [
{
"cell_type": "markdown",
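@@ -39,6 +39,8 @@
"metadata": {},
"outputs": [],
"source": [
- "model_path = Model.get_model_path(model_name=\"sklearn_regression_model.pkl\")\n",
- "model = joblib.load(model_path)"
+ "def init():\n",
+ "    global model\n",
+ "    model_path = Model.get_model_path(model_name=\"sklearn_regression_model.pkl\")\n",
+ "    model = joblib.load(model_path)"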
]
@@ -56,10 +57,12 @@
"metadata": {},
"outputs": [],
"source": [
- "raw_data = '{\"data\":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}'\n",
+ "def run(raw_data, request_headers):\n",
+ "    data = json.loads(raw_data)[\"data\"]\n",
+ "    data = numpy.array(data)\n",
+ "    result = model.predict(data)\n",
"\n",
- "data = json.loads(raw_data)[\"data\"]\n",
- "data = numpy.array(data)"
+ "    return {\"result\": result.tolist()}\n"
]
},
{
@@ -70,9 +73,22 @@
]
},
{
- "cell_type": "code",
+ "cell_type": "code",
"execution_count": 4,
"metadata": {},
+ "outputs": [],
+ "source": [
+ "init()\n",
+ "test_row = '{\"data\":[[1,2,3,4,5,6,7,8,9,10],[10,9,8,7,6,5,4,3,2,1]]}'\n",
+ "request_headers = {}\n",
+ "prediction = run(test_row, request_headers)\n",
+ "print(\"Test result: \", prediction)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
"outputs": [
{
"name": "stdout",
@@ -83,7 +99,6 @@
}
],
"source": [
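- "request_headers = {}\n",
"\n",
- "result = model.predict(data)\n",
- "print(\"Test result: \", {\"result\": result.tolist()})"
+ "# Reuse the prediction returned by run(); `data` is no longer a notebook-level variable\n",
+ "print(\"Test result: \", prediction)"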
diff --git a/experimentation/Diabetes Ridge Regression Training.ipynb b/experimentation/Diabetes Ridge Regression Training.ipynb
index fa192115..1ee2165a 100644
--- a/experimentation/Diabetes Ridge Regression Training.ipynb
+++ b/experimentation/Diabetes Ridge Regression Training.ipynb
@@ -28,358 +28,99 @@
"import pandas as pd"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Load Data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample_data = load_diabetes()\n",
- "\n",
- "df = pd.DataFrame(\n",
- " data=sample_data.data,\n",
- " columns=sample_data.feature_names)\n",
- "df['Y'] = sample_data.target"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "(442, 10)\n"
- ]
- }
- ],
- "source": [
- "print(df.shape)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " age | \n",
- " sex | \n",
- " bmi | \n",
- " bp | \n",
- " s1 | \n",
- " s2 | \n",
- " s3 | \n",
- " s4 | \n",
- " s5 | \n",
- " s6 | \n",
- " Y | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 4.420000e+02 | \n",
- " 442.000000 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " -3.634285e-16 | \n",
- " 1.308343e-16 | \n",
- " -8.045349e-16 | \n",
- " 1.281655e-16 | \n",
- " -8.835316e-17 | \n",
- " 1.327024e-16 | \n",
- " -4.574646e-16 | \n",
- " 3.777301e-16 | \n",
- " -3.830854e-16 | \n",
- " -3.412882e-16 | \n",
- " 152.133484 | \n",
- "
\n",
- " \n",
- " std | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 4.761905e-02 | \n",
- " 77.093005 | \n",
- "
\n",
- " \n",
- " min | \n",
- " -1.072256e-01 | \n",
- " -4.464164e-02 | \n",
- " -9.027530e-02 | \n",
- " -1.123996e-01 | \n",
- " -1.267807e-01 | \n",
- " -1.156131e-01 | \n",
- " -1.023071e-01 | \n",
- " -7.639450e-02 | \n",
- " -1.260974e-01 | \n",
- " -1.377672e-01 | \n",
- " 25.000000 | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " -3.729927e-02 | \n",
- " -4.464164e-02 | \n",
- " -3.422907e-02 | \n",
- " -3.665645e-02 | \n",
- " -3.424784e-02 | \n",
- " -3.035840e-02 | \n",
- " -3.511716e-02 | \n",
- " -3.949338e-02 | \n",
- " -3.324879e-02 | \n",
- " -3.317903e-02 | \n",
- " 87.000000 | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " 5.383060e-03 | \n",
- " -4.464164e-02 | \n",
- " -7.283766e-03 | \n",
- " -5.670611e-03 | \n",
- " -4.320866e-03 | \n",
- " -3.819065e-03 | \n",
- " -6.584468e-03 | \n",
- " -2.592262e-03 | \n",
- " -1.947634e-03 | \n",
- " -1.077698e-03 | \n",
- " 140.500000 | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " 3.807591e-02 | \n",
- " 5.068012e-02 | \n",
- " 3.124802e-02 | \n",
- " 3.564384e-02 | \n",
- " 2.835801e-02 | \n",
- " 2.984439e-02 | \n",
- " 2.931150e-02 | \n",
- " 3.430886e-02 | \n",
- " 3.243323e-02 | \n",
- " 2.791705e-02 | \n",
- " 211.500000 | \n",
- "
\n",
- " \n",
- " max | \n",
- " 1.107267e-01 | \n",
- " 5.068012e-02 | \n",
- " 1.705552e-01 | \n",
- " 1.320442e-01 | \n",
- " 1.539137e-01 | \n",
- " 1.987880e-01 | \n",
- " 1.811791e-01 | \n",
- " 1.852344e-01 | \n",
- " 1.335990e-01 | \n",
- " 1.356118e-01 | \n",
- " 346.000000 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " age sex bmi bp s1 \\\n",
- "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n",
- "mean -3.634285e-16 1.308343e-16 -8.045349e-16 1.281655e-16 -8.835316e-17 \n",
- "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n",
- "min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01 \n",
- "25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02 \n",
- "50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670611e-03 -4.320866e-03 \n",
- "75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564384e-02 2.835801e-02 \n",
- "max 1.107267e-01 5.068012e-02 1.705552e-01 1.320442e-01 1.539137e-01 \n",
- "\n",
- " s2 s3 s4 s5 s6 \\\n",
- "count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 \n",
- "mean 1.327024e-16 -4.574646e-16 3.777301e-16 -3.830854e-16 -3.412882e-16 \n",
- "std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 \n",
- "min -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01 \n",
- "25% -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02 \n",
- "50% -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03 \n",
- "75% 2.984439e-02 2.931150e-02 3.430886e-02 3.243323e-02 2.791705e-02 \n",
- "max 1.987880e-01 1.811791e-01 1.852344e-01 1.335990e-01 1.356118e-01 \n",
- "\n",
- " Y \n",
- "count 442.000000 \n",
- "mean 152.133484 \n",
- "std 77.093005 \n",
- "min 25.000000 \n",
- "25% 87.000000 \n",
- "50% 140.500000 \n",
- "75% 211.500000 \n",
- "max 346.000000 "
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# All data in a single dataframe\n",
- "df.describe()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Split Data into Training and Validation Sets"
- ]
- },
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
+ "# Split the dataframe into test and train data\n",
+ "def split_data(df):\n",
- "X = df.drop('Y', axis=1).values\n",
- "y = df['Y'].values\n",
+ "    X = df.drop('Y', axis=1).values\n",
+ "    y = df['Y'].values\n",
"\n",
- "X_train, X_test, y_train, y_test = train_test_split(\n",
- " X, y, test_size=0.2, random_state=0)\n",
- "data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
- " \"test\": {\"X\": X_test, \"y\": y_test}}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Train Model on Training Set"
+ "    X_train, X_test, y_train, y_test = train_test_split(\n",
+ "        X, y, test_size=0.2, random_state=0)\n",
+ "    data = {\"train\": {\"X\": X_train, \"y\": y_train},\n",
+ "            \"test\": {\"X\": X_test, \"y\": y_test}}\n",
+ "    return data"
]
},
{
"cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "Ridge(alpha=0.5, copy_X=True, fit_intercept=True, max_iter=None,\n",
- " normalize=False, random_state=None, solver='auto', tol=0.001)"
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# experiment parameters\n",
- "args = {\n",
- " \"alpha\": 0.5\n",
- "}\n",
- "\n",
- "reg_model = Ridge(**args)\n",
- "reg_model.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])"
- ]
- },
- {
- "cell_type": "markdown",
+ "execution_count": 3,
"metadata": {},
+ "outputs": [],
"source": [
- "## Validate Model on Validation Set"
+ "# Train the model, return the model\n",
+ "def train_model(data, args):\n",
+ "    reg_model = Ridge(**args)\n",
+ "    reg_model.fit(data[\"train\"][\"X\"], data[\"train\"][\"y\"])\n",
+ "    return reg_model"
]
- },
- {
+ },
+ {
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 4,
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "{'mse': 3298.9096058070622}\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
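+ "# Evaluate the metrics for the model\n",
+ "def get_model_metrics(reg_model, data):\n",
- "preds = reg_model.predict(data[\"test\"][\"X\"])\n",
- "mse = mean_squared_error(preds, y_test)\n",
- "metrics = {\"mse\": mse}\n",
- "print(metrics)"
+ "    preds = reg_model.predict(data[\"test\"][\"X\"])\n",
+ "    mse = mean_squared_error(data[\"test\"][\"y\"], preds)\n",
+ "    metrics = {\"mse\": mse}\n",
+ "    return metrics"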
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 5,
"metadata": {},
+ "outputs": [],
"source": [
- "## Save Model"
+ "def main():\n",
+ "    # Load data\n",
+ "    sample_data = load_diabetes()\n",
+ "\n",
+ "    df = pd.DataFrame(\n",
+ "        data=sample_data.data,\n",
+ "        columns=sample_data.feature_names)\n",
+ "    df['Y'] = sample_data.target\n",
+ "\n",
+ "    # Split Data into Training and Validation Sets\n",
+ "    data = split_data(df)\n",
+ "\n",
+ "    # Train Model on Training Set\n",
+ "    args = {\n",
+ "        \"alpha\": 0.5\n",
+ "    }\n",
+ "\n",
+ "    reg = train_model(data, args)\n",
+ "\n",
+ "    # Validate Model on Validation Set\n",
+ "    metrics = get_model_metrics(reg, data)\n",
+ "    print(metrics)\n",
+ "\n",
+ "    # Save Model\n",
+ "    model_name = \"sklearn_regression_model.pkl\"\n",
+ "    joblib.dump(value=reg, filename=model_name)"
]
},
- {
- "cell_type": "code",
+{
+"cell_type": "code",
"execution_count": 7,
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['sklearn_regression_model.pkl']"
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model_name = \"sklearn_regression_model.pkl\"\n",
+ "outputs": [],
- "\n",
- "joblib.dump(value=reg, filename=model_name)"
+ "source": [
+ "# Run the full pipeline: load, split, train, evaluate, save\n",
+ "main()"
]
}
- ],
+ ],
"metadata": {
"kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
@@ -399,3 +140,5 @@
"nbformat": 4,
"nbformat_minor": 2
}
+
+