
merge postga branch into main #1815

Merged 32 commits into main from postga on Nov 18, 2022
Commits
6f16591
Add postga build trigger (#1755) (#1756)
gaugup Sep 29, 2022
a54a531
Merge branch 'main' into postga
gaugup Oct 3, 2022
7bb4087
Add model wrapper for wrapping predictions and test data (#1762)
gaugup Oct 5, 2022
8395b89
Merge branch 'main' into postga
gaugup Oct 7, 2022
c400c30
Merge branch 'main' into postga
gaugup Oct 7, 2022
d0d338e
Change description of cohort selection panel in Aggregate Feature Imp…
gaugup Oct 13, 2022
18af5ab
Support cohort filtering of string target in rai_insights (#1771)
gaugup Oct 18, 2022
c23be59
Simplify tests in test_cohort_filter.py (#1772)
gaugup Oct 18, 2022
27fb668
Move cohort re-labelling logic to Cohort.ts (#1773)
gaugup Oct 19, 2022
5609f3a
Update requirements.txt (#1779)
gaugup Oct 20, 2022
68e1e2a
Refactor DataAnalysisTab into Chartview/DataAnalysisView and DataBala…
gaugup Oct 20, 2022
a5ecb69
[Highcharts] Enable csv download, xls download and view data table in…
tongyu-microsoft Oct 20, 2022
e046f08
Merge branch 'main' into postga
gaugup Oct 25, 2022
5f0d1c8
Merge branch 'main' into postga
gaugup Oct 26, 2022
0a45019
Add new data APIs for compute_matrix() and compute_error_tree() (#1790)
gaugup Oct 28, 2022
2183db1
update e2e visit function (#1792)
tongyu-microsoft Oct 27, 2022
9438ac6
Add rai_test_utils package (#1777)
gaugup Oct 29, 2022
3930624
Merge branch 'main' into postga
gaugup Oct 30, 2022
1ff6c9d
Merge branch 'main' into postga
gaugup Oct 31, 2022
31609b9
sort fix (#1798)
vinuthakaranth Nov 1, 2022
105091c
Merge branch 'main' into postga
gaugup Nov 3, 2022
786f4ff
Merge branch 'main' into postga
gaugup Nov 5, 2022
3ad308b
[LogarithmicScaling] Enable log scaling for data explorer, feature im…
tongyu-microsoft Nov 7, 2022
9b5622a
Update CODEOWNERS (#1805)
tongyu-microsoft Nov 7, 2022
e88afcd
Fix ‘Close button’ and ‘Spin button’ present in new cohort dialog doe…
vinuthakaranth Nov 8, 2022
82c0c56
Update erroranalysis tests to use new rai_test_utils package (#1800)
gaugup Nov 8, 2022
771a4e7
Fix Heatmap tooltip theme (#1808)
vinuthakaranth Nov 8, 2022
73a9314
Merge branch 'main' into postga
gaugup Nov 9, 2022
8f94937
Calculate RAI insights for first 5K samples (#1803)
gaugup Nov 9, 2022
176bfea
Add API for indicating large data scenario in ModelAssessmentContext …
gaugup Nov 10, 2022
8201643
Add RAI version number to oss serialize result (#1812)
tongyu-microsoft Nov 11, 2022
990e086
Merge branch 'main' into postga
gaugup Nov 15, 2022
2 changes: 2 additions & 0 deletions .eslintrc/.eslintrc.custom.eslintrc
@@ -77,6 +77,7 @@
"feature_value",
"global_effects",
"identity_feature_name",
"is_large_data_scenario",
"local_effects",
"local_importance",
"local_policies",
@@ -130,6 +131,7 @@
"true_positive_rate_ratio",
"true_y",
"upper_bounds",
"use_entire_test_data",
"zero_one_loss"
]
}
1 change: 1 addition & 0 deletions .eslintrc/.eslintrc.import.eslintrc
@@ -10,6 +10,7 @@
"regenerator-runtime/runtime.js",
"highcharts/modules/accessibility",
"highcharts/modules/exporting",
"highcharts/modules/export-data",
"highcharts/highcharts-more",
"highcharts/modules/heatmap",
"highcharts/modules/pattern-fill",
134 changes: 83 additions & 51 deletions erroranalysis/erroranalysis/_internal/matrix_filter.py
@@ -60,8 +60,8 @@ def compute_json_matrix(analyzer, features, filters, composite_filters):
return compute_matrix(analyzer, features, filters, composite_filters)


def compute_matrix(analyzer, features, filters, composite_filters,
quantile_binning=False, num_bins=BIN_THRESHOLD):
def compute_matrix_on_dataset(analyzer, features, dataset,
quantile_binning=False, num_bins=BIN_THRESHOLD):
"""Compute a matrix of metrics for a given set of feature names.

The filters and composite filters are used to filter the data
@@ -71,73 +71,36 @@ def compute_matrix(analyzer, features, filters, composite_filters,
:type analyzer: BaseAnalyzer
:param features: A list of one or two feature names to compute metrics for.
:type features: list
:param filters: A list of filters to apply to the data.
:type filters: list
:param composite_filters: A list of composite filters to apply to the data.
:type composite_filters: list
:param dataset: The dataset on which the matrix view needs to be computed.
The dataset should have the feature columns and the columns
'true_y' and 'index'. The 'true_y' column should have the true
target values corresponding to the test data. The 'index'
column should have the row indices. If the analyzer is of type
PredictionsAnalyzer, then the dataset should include the column
'pred_y' which will hold the predictions.
:type dataset: pd.DataFrame
:param quantile_binning: Whether to use quantile binning.
:type quantile_binning: bool
:param num_bins: The number of bins to use for quantile binning.
:type num_bins: int
:return: A dictionary representation of the computed matrix which can be
saved to JSON.
:rtype: dict

:Example:

An example of running compute_matrix with a filter and a composite
filter:

>>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
>>> from erroranalysis._internal.matrix_filter import (
... compute_matrix)
>>> from erroranalysis._internal.constants import ModelTask
>>> from sklearn.datasets import load_breast_cancer
>>> from sklearn.model_selection import train_test_split
>>> from sklearn import svm
>>> breast_cancer_data = load_breast_cancer()
>>> feature_names = breast_cancer_data.feature_names
>>> X_train, X_test, y_train, y_test = train_test_split(
... breast_cancer_data.data, breast_cancer_data.target,
... test_size=0.5, random_state=0)
>>> categorical_features = []
>>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
... random_state=777)
>>> model = clf.fit(X_train, y_train)
>>> model_task = ModelTask.CLASSIFICATION
>>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
... categorical_features, model_task=model_task)
>>> filters = [{'arg': [23.85], 'column': 'mean radius',
... 'method': 'less and equal'}]
>>> composite_filters = [{'compositeFilters':
... [{'compositeFilters':
... [{'arg': [13.45, 22.27],
... 'column': 'mean radius',
... 'method': 'in the range of'},
... {'arg': [10.88, 24.46],
... 'column': 'mean texture',
... 'method': 'in the range of'}],
... 'operation': 'and'}],
... 'operation': 'or'}]
>>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
... filters, composite_filters)
"""
if num_bins <= 0:
raise ValueError(
'Number of bins parameter must be greater than 0 for the heatmap')
if features[0] is None and features[1] is None:
raise ValueError(
'One or two features must be specified to compute the heat map')
filtered_df = filter_from_cohort(analyzer,
filters,
composite_filters)
true_y = filtered_df[TRUE_Y]

true_y = dataset[TRUE_Y]
dropped_cols = [TRUE_Y, ROW_INDEX]
is_model_analyzer = hasattr(analyzer, 'model')
if not is_model_analyzer:
pred_y = filtered_df[PRED_Y]
pred_y = dataset[PRED_Y]
dropped_cols.append(PRED_Y)
input_data = filtered_df.drop(columns=dropped_cols)
input_data = dataset.drop(columns=dropped_cols)
is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
metric = analyzer.metric
if is_pandas:
@@ -322,6 +285,75 @@ def compute_matrix(analyzer, features, filters, composite_filters,
return matrix


def compute_matrix(analyzer, features, filters, composite_filters,
quantile_binning=False, num_bins=BIN_THRESHOLD):
"""Compute a matrix of metrics for a given set of feature names.

The filters and composite filters are used to filter the data
prior to computing the matrix.

:param analyzer: The error analyzer.
:type analyzer: BaseAnalyzer
:param features: A list of one or two feature names to compute metrics for.
:type features: list
:param filters: A list of filters to apply to the data.
:type filters: list
:param composite_filters: A list of composite filters to apply to the data.
:type composite_filters: list
:param quantile_binning: Whether to use quantile binning.
:type quantile_binning: bool
:param num_bins: The number of bins to use for quantile binning.
:type num_bins: int
:return: A dictionary representation of the computed matrix which can be
saved to JSON.
:rtype: dict

:Example:

An example of running compute_matrix with a filter and a composite
filter:

>>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
>>> from erroranalysis._internal.matrix_filter import (
... compute_matrix)
>>> from erroranalysis._internal.constants import ModelTask
>>> from sklearn.datasets import load_breast_cancer
>>> from sklearn.model_selection import train_test_split
>>> from sklearn import svm
>>> breast_cancer_data = load_breast_cancer()
>>> feature_names = breast_cancer_data.feature_names
>>> X_train, X_test, y_train, y_test = train_test_split(
... breast_cancer_data.data, breast_cancer_data.target,
... test_size=0.5, random_state=0)
>>> categorical_features = []
>>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
... random_state=777)
>>> model = clf.fit(X_train, y_train)
>>> model_task = ModelTask.CLASSIFICATION
>>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
... categorical_features, model_task=model_task)
>>> filters = [{'arg': [23.85], 'column': 'mean radius',
... 'method': 'less and equal'}]
>>> composite_filters = [{'compositeFilters':
... [{'compositeFilters':
... [{'arg': [13.45, 22.27],
... 'column': 'mean radius',
... 'method': 'in the range of'},
... {'arg': [10.88, 24.46],
... 'column': 'mean texture',
... 'method': 'in the range of'}],
... 'operation': 'and'}],
... 'operation': 'or'}]
>>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
... filters, composite_filters)
"""
filtered_df = filter_from_cohort(analyzer,
filters,
composite_filters)
return compute_matrix_on_dataset(analyzer, features, filtered_df,
quantile_binning, num_bins)


def convert_dtypes(df):
"""Converts the dtypes of the dataframe to the most efficient type.

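For reference, a minimal sketch (not part of the diff) of how the new compute_matrix_on_dataset entry point could be called directly on a pre-assembled dataset. It mirrors the setup from the compute_matrix doctest above; the breast-cancer data, the hand-built DataFrame, and the chosen feature pair are illustrative assumptions, and the 'true_y'/'index' column names follow the docstring. compute_matrix itself remains the filter-based entry point: it now runs filter_from_cohort and delegates to compute_matrix_on_dataset.

import pandas as pd
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from erroranalysis._internal.constants import ModelTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.matrix_filter import compute_matrix_on_dataset

# Train a simple classifier, as in the doctest above.
breast_cancer_data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data.data, breast_cancer_data.target,
    test_size=0.5, random_state=0)
model = svm.SVC(gamma=0.001, C=100., probability=True,
                random_state=777).fit(X_train, y_train)
analyzer = ModelAnalyzer(model, X_test, y_test,
                         list(breast_cancer_data.feature_names),
                         [], model_task=ModelTask.CLASSIFICATION)

# Assemble the dataset the docstring describes: the feature columns plus
# 'true_y' (true labels) and 'index' (row indices). Any cohort filtering
# would already have been applied to this frame.
dataset = pd.DataFrame(X_test, columns=breast_cancer_data.feature_names)
dataset['true_y'] = y_test
dataset['index'] = range(X_test.shape[0])

# Compute the heatmap matrix for two features on the pre-filtered data.
matrix = compute_matrix_on_dataset(
    analyzer, ['mean radius', 'mean texture'], dataset)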
120 changes: 82 additions & 38 deletions erroranalysis/erroranalysis/_internal/surrogate_error_tree.py
@@ -96,6 +96,80 @@ def compute_json_error_tree(analyzer,
min_child_samples)


def compute_error_tree_on_dataset(
analyzer,
features,
dataset,
max_depth=DEFAULT_MAX_DEPTH,
num_leaves=DEFAULT_NUM_LEAVES,
min_child_samples=DEFAULT_MIN_CHILD_SAMPLES):
"""Computes the error tree for the given dataset.

:param analyzer: The error analyzer containing the categorical
features and categories for the full dataset.
:type analyzer: BaseAnalyzer
:param features: A list of one or two feature names to train the
surrogate model on.
:type features: list
:param dataset: The dataset on which the error tree needs to be computed.
The dataset should have the feature columns and the columns
'true_y' and 'index'. The 'true_y' column should have the true
target values corresponding to the test data. The 'index'
column should have the row indices. If the analyzer is of type
PredictionsAnalyzer, then the dataset should include the column
'pred_y' which will hold the predictions.
:type dataset: pd.DataFrame
:param max_depth: The maximum depth of the surrogate tree trained
on errors.
:type max_depth: int
:param num_leaves: The number of leaves of the surrogate tree
trained on errors.
:type num_leaves: int
:param min_child_samples: The minimum number of samples required to
create one leaf.
:type min_child_samples: int
:return: The tree representation as a list of nodes.
:rtype: list[dict[str, str]]
"""
if max_depth is None:
max_depth = DEFAULT_MAX_DEPTH
if num_leaves is None:
num_leaves = DEFAULT_NUM_LEAVES
if min_child_samples is None:
min_child_samples = DEFAULT_MIN_CHILD_SAMPLES

if dataset.shape[0] == 0:
return create_empty_node(analyzer.metric)
is_model_analyzer = hasattr(analyzer, MODEL)
indexes = []
for feature in features:
indexes.append(analyzer.feature_names.index(feature))
dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
dataset_sub_names = list(dataset_sub_names)
if not is_spark(dataset):
booster, dataset_indexed_df, cat_info = get_surrogate_booster_local(
dataset, analyzer, is_model_analyzer, indexes,
dataset_sub_names, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed, categories_reindexed = cat_info
else:
booster, dataset_indexed_df = get_surrogate_booster_pyspark(
dataset, analyzer, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed = []
categories_reindexed = []
dumped_model = booster.dump_model()
tree_structure = dumped_model["tree_info"][0]['tree_structure']
max_split_index = get_max_split_index(tree_structure) + 1
cache_subtree_features(tree_structure, dataset_sub_names)
tree = traverse(dataset_indexed_df,
tree_structure,
max_split_index,
(categories_reindexed,
cat_ind_reindexed),
[],
dataset_sub_names,
metric=analyzer.metric,
classes=analyzer.classes)
return tree


def compute_error_tree(analyzer,
features,
filters,
@@ -165,47 +239,17 @@ def compute_error_tree(analyzer,
... filters, composite_filters)
"""
# Fit a surrogate model on errors
if max_depth is None:
max_depth = DEFAULT_MAX_DEPTH
if num_leaves is None:
num_leaves = DEFAULT_NUM_LEAVES
if min_child_samples is None:
min_child_samples = DEFAULT_MIN_CHILD_SAMPLES
filtered_df = filter_from_cohort(analyzer,
filters,
composite_filters)
if filtered_df.shape[0] == 0:
return create_empty_node(analyzer.metric)
is_model_analyzer = hasattr(analyzer, MODEL)
indexes = []
for feature in features:
indexes.append(analyzer.feature_names.index(feature))
dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
dataset_sub_names = list(dataset_sub_names)
if not is_spark(filtered_df):
booster, filtered_indexed_df, cat_info = get_surrogate_booster_local(
filtered_df, analyzer, is_model_analyzer, indexes,
dataset_sub_names, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed, categories_reindexed = cat_info
else:
booster, filtered_indexed_df = get_surrogate_booster_pyspark(
filtered_df, analyzer, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed = []
categories_reindexed = []
dumped_model = booster.dump_model()
tree_structure = dumped_model["tree_info"][0]['tree_structure']
max_split_index = get_max_split_index(tree_structure) + 1
cache_subtree_features(tree_structure, dataset_sub_names)
tree = traverse(filtered_indexed_df,
tree_structure,
max_split_index,
(categories_reindexed,
cat_ind_reindexed),
[],
dataset_sub_names,
metric=analyzer.metric,
classes=analyzer.classes)
return tree
return compute_error_tree_on_dataset(
analyzer,
features,
filtered_df,
max_depth=max_depth,
num_leaves=num_leaves,
min_child_samples=min_child_samples
)


def get_surrogate_booster_local(filtered_df, analyzer, is_model_analyzer,
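Similarly, a hedged sketch of the new compute_error_tree_on_dataset entry point, reusing the analyzer and dataset objects built in the compute_matrix_on_dataset sketch above. The max_depth, num_leaves and min_child_samples values shown are illustrative, not the module defaults; passing None makes the function fall back to DEFAULT_MAX_DEPTH, DEFAULT_NUM_LEAVES and DEFAULT_MIN_CHILD_SAMPLES. compute_error_tree keeps its filter-based signature and now delegates to this function after filter_from_cohort.

from erroranalysis._internal.surrogate_error_tree import (
    compute_error_tree_on_dataset)

# Reuses `analyzer` and `dataset` from the previous sketch. The surrogate
# tree is trained on the selected features over the supplied dataset.
tree = compute_error_tree_on_dataset(
    analyzer,
    ['mean radius', 'mean texture'],
    dataset,
    max_depth=3,            # illustrative values, not the module defaults
    num_leaves=31,
    min_child_samples=20)

# `tree` is the surrogate tree as a list of node dictionaries
# (see the docstring above).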