From cdb0e4a5d9a08900b08683c151fa1094841e51fd Mon Sep 17 00:00:00 2001
From: Gaurav Gupta <47334368+gaugup@users.noreply.github.com>
Date: Sun, 27 Feb 2022 06:22:05 -0800
Subject: [PATCH] Simplify the train pipeline
 responsibleaidashboard-census-classification-model-debugging.ipynb (#1195)

* Simplify the train pipeline responsibleaidashboard-census-classification-model-debugging.ipynb

Signed-off-by: Gaurav Gupta

* Address code review comments

* Update notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb

Co-authored-by: Roman Lutz

Co-authored-by: Roman Lutz
---
 ...ensus-classification-model-debugging.ipynb | 38 +++++++++----------
 1 file changed, 17 insertions(+), 21 deletions(-)

diff --git a/notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb b/notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb
index d149e84343..20c88878db 100644
--- a/notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb
+++ b/notebooks/responsibleaidashboard/responsibleaidashboard-census-classification-model-debugging.ipynb
@@ -79,7 +79,7 @@
    "id": "clinical-henry",
    "metadata": {},
    "source": [
-    "First, load the census dataset and specify the different types of features. Then, clean the target feature values to include only 0 and 1."
+    "First, load the census dataset and specify the different types of features. Compose a pipeline which contains a preprocessor and estimator."
    ]
   },
   {
@@ -99,7 +99,7 @@
     "    y = dataset[[target_feature]]\n",
     "    return X, y\n",
     "\n",
-    "def clean_data(X, y, target_feature):\n",
+    "def create_classification_pipeline(X, y, target_feature):\n",
     "    features = X.columns.values.tolist()\n",
     "    classes = y[target_feature].unique().tolist()\n",
     "    pipe_cfg = {\n",
@@ -118,9 +118,13 @@
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
     "        ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])\n",
     "    ])\n",
-    "    X = feat_pipe.fit_transform(X)\n",
-    "    print(pipe_cfg['cat_cols'])\n",
-    "    return X, feat_pipe, features, classes\n",
+    "\n",
+    "    # Append classifier to preprocessing pipeline.\n",
+    "    # Now we have a full prediction pipeline.\n",
+    "    pipeline = Pipeline(steps=[('preprocessor', feat_pipe),\n",
+    "                               ('model', LGBMClassifier())])\n",
+    "\n",
+    "    return pipeline\n",
     "\n",
     "outdirname = 'responsibleai.12.28.21'\n",
     "try:\n",
@@ -140,22 +144,17 @@
     "train_data = pd.read_csv('adult-train.csv')\n",
     "test_data = pd.read_csv('adult-test.csv')\n",
     "\n",
-    "\n",
     "X_train_original, y_train = split_label(train_data, target_feature)\n",
     "X_test_original, y_test = split_label(test_data, target_feature)\n",
     "\n",
+    "pipeline = create_classification_pipeline(X_train_original, y_train, target_feature)\n",
     "\n",
-    "X_train, feat_pipe, features, classes = clean_data(X_train_original, y_train, target_feature)\n",
     "y_train = y_train[target_feature].to_numpy()\n",
-    "\n",
-    "X_test = feat_pipe.transform(X_test_original)\n",
     "y_test = y_test[target_feature].to_numpy()\n",
     "\n",
-    "train_data[target_feature] = y_train\n",
-    "test_data[target_feature] = y_test\n",
     "\n",
-    "test_data_sample = test_data.sample(n=500, random_state=5)\n",
-    "train_data_sample = train_data.sample(n=8000, random_state=5)"
+    "# Take 500 samples from the test data\n",
+    "test_data_sample = test_data.sample(n=500, random_state=5)"
    ]
   },
   {
@@ -163,7 +162,7 @@
    "id": "potential-proportion",
    "metadata": {},
    "source": [
-    "Train a LightGBM classifier on the training data."
+    "Train the classification pipeline composed in the previous cell on the training data."
    ]
   },
   {
@@ -173,8 +172,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "clf = LGBMClassifier()\n",
-    "model = clf.fit(X_train, y_train)"
+    "model = pipeline.fit(X_train_original, y_train)"
    ]
   },
   {
@@ -213,10 +211,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dashboard_pipeline = Pipeline(steps=[('preprocess', feat_pipe), ('model', model)])\n",
-    "\n",
-    "rai_insights = RAIInsights(dashboard_pipeline, train_data_sample, test_data_sample, target_feature, 'classification',\n",
-    "                           categorical_features=categorical_features)"
+    "rai_insights = RAIInsights(model, train_data, test_data_sample, target_feature, 'classification',\n",
+    "                           categorical_features=categorical_features)"
    ]
   },
   {
@@ -519,7 +515,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.13"
+   "version": "3.7.11"
   }
  },
  "nbformat": 4,