NVIDIA · mattahrens · May 1, 2024 · May 1, 2024 · May 1, 2024
diff --git a/user_tools/pyproject.toml b/user_tools/pyproject.toml
@@ -46,6 +46,8 @@ dependencies = [
     "progress==1.6",
     # used for model estimations
     "xgboost==2.0.3",
+    # used for model interpretability
+    "shap==0.44.1",
     # used for retrieving available memory on the host
     "psutil==5.9.8"
 ]

diff --git a/user_tools/src/spark_rapids_pytools/resources/qualification-conf.yaml b/user_tools/src/spark_rapids_pytools/resources/qualification-conf.yaml
@@ -260,6 +260,8 @@ local:
           name: 'per_sql.csv'
         perApp:
           name: 'per_app.csv'
+        shapValues:
+          name: 'shap_values.csv'
       updateResult:
         subsetColumns:
           - 'appId'

diff --git a/user_tools/src/spark_rapids_tools/tools/model_xgboost.py b/user_tools/src/spark_rapids_tools/tools/model_xgboost.py
@@ -26,6 +26,7 @@
 import numpy as np
 import pandas as pd
 import xgboost as xgb
+import shap
 from tabulate import tabulate
 from xgboost.core import XGBoostError
 
@@ -1188,6 +1189,7 @@ def predict_model(
         cpu_aug_tbl: pd.DataFrame,
         feature_cols: List[str],
         label_col: str,
+        output_info: Optional[dict] = None,
 ) -> pd.DataFrame:
     """Use model to predict on feature data."""
     model_features = xgb_model.feature_names
@@ -1205,6 +1207,20 @@ def predict_model(
     dmat = xgb.DMatrix(x_dim, y_dim)
     y_pred = xgb_model.predict(dmat)
 
+    # Shapley explainer for the prediction: rank features by mean absolute SHAP value
+    pd.set_option('display.max_rows', None)
+    explainer = shap.TreeExplainer(xgb_model)
+    shap_values = explainer.shap_values(x_dim)
+    shap_vals = np.abs(shap_values).mean(axis=0)
+    feature_importance = pd.DataFrame(
+            list(zip(feature_cols, shap_vals)), columns=['feature', 'shap_value']
+    )
+    feature_importance.sort_values(by=['shap_value'], ascending=False, inplace=True)
+    shap_values_path = output_info['shapValues']['path']
+    logger.info('Writing SHAPley values to: %s', shap_values_path)
+    feature_importance.to_csv(shap_values_path, index=False)
+    logger.info('Feature importance (SHAPley values)\n %s', feature_importance)
+
     if y_dim is not None:
         # evaluation
         if LOG_LABEL:
@@ -1430,7 +1446,7 @@ def predict(platform: str = 'onprem',
             features, feature_cols, label_col = extract_model_features(input_df)
             # note: dataset name is already stored in the 'appName' field
             try:
-                results = predict_model(xgb_model, features, feature_cols, label_col)
+                results = predict_model(xgb_model, features, feature_cols, label_col, output_info)
 
                 # compute per-app speedups
                 summary = _compute_summary(results)