
merge postga branch into main #1815

Merged 32 commits into main from postga on Nov 18, 2022
Commits
6f16591
Add postga build trigger (#1755) (#1756)
gaugup Sep 29, 2022
a54a531
Merge branch 'main' into postga
gaugup Oct 3, 2022
7bb4087
Add model wrapper for wrapping predictions and test data (#1762)
gaugup Oct 5, 2022
8395b89
Merge branch 'main' into postga
gaugup Oct 7, 2022
c400c30
Merge branch 'main' into postga
gaugup Oct 7, 2022
d0d338e
Change description of cohort selection panel in Aggregate Feature Imp…
gaugup Oct 13, 2022
18af5ab
Support cohort filtering of string target in rai_insights (#1771)
gaugup Oct 18, 2022
c23be59
Simplify tests in test_cohort_filter.py (#1772)
gaugup Oct 18, 2022
27fb668
Move cohort re-labelling logic to Cohort.ts (#1773)
gaugup Oct 19, 2022
5609f3a
Update requirements.txt (#1779)
gaugup Oct 20, 2022
68e1e2a
Refactor DataAnalysisTab into Chartview/DataAnalysisView and DataBala…
gaugup Oct 20, 2022
a5ecb69
[Highcharts] Enable csv download, xls download and view data table in…
tongyu-microsoft Oct 20, 2022
e046f08
Merge branch 'main' into postga
gaugup Oct 25, 2022
5f0d1c8
Merge branch 'main' into postga
gaugup Oct 26, 2022
0a45019
Add new data APIs for compute_matrix() and compute_error_tree() (#1790)
gaugup Oct 28, 2022
2183db1
update e2e visit function (#1792)
tongyu-microsoft Oct 27, 2022
9438ac6
Add rai_test_utils package (#1777)
gaugup Oct 29, 2022
3930624
Merge branch 'main' into postga
gaugup Oct 30, 2022
1ff6c9d
Merge branch 'main' into postga
gaugup Oct 31, 2022
31609b9
sort fix (#1798)
vinuthakaranth Nov 1, 2022
105091c
Merge branch 'main' into postga
gaugup Nov 3, 2022
786f4ff
Merge branch 'main' into postga
gaugup Nov 5, 2022
3ad308b
[LogarithmicScaling] Enable log scaling for data explorer, feature im…
tongyu-microsoft Nov 7, 2022
9b5622a
Update CODEOWNERS (#1805)
tongyu-microsoft Nov 7, 2022
e88afcd
Fix ‘Close button’ and ‘Spin button’ present in new cohort dialog doe…
vinuthakaranth Nov 8, 2022
82c0c56
Update erroranalysis tests to use new rai_test_utils package (#1800)
gaugup Nov 8, 2022
771a4e7
Fix Heatmap tooltip theme (#1808)
vinuthakaranth Nov 8, 2022
73a9314
Merge branch 'main' into postga
gaugup Nov 9, 2022
8f94937
Calculate RAI insights for first 5K samples (#1803)
gaugup Nov 9, 2022
176bfea
Add API for indicating large data scenario in ModelAssessmentContext …
gaugup Nov 10, 2022
8201643
Add RAI version number to oss serialize result (#1812)
tongyu-microsoft Nov 11, 2022
990e086
Merge branch 'main' into postga
gaugup Nov 15, 2022
2 changes: 2 additions & 0 deletions .eslintrc/.eslintrc.custom.eslintrc
@@ -77,6 +77,7 @@
"feature_value",
"global_effects",
"identity_feature_name",
"is_large_data_scenario",
"local_effects",
"local_importance",
"local_policies",
@@ -130,6 +131,7 @@
"true_positive_rate_ratio",
"true_y",
"upper_bounds",
"use_entire_test_data",
"zero_one_loss"
]
}
1 change: 1 addition & 0 deletions .eslintrc/.eslintrc.import.eslintrc
@@ -10,6 +10,7 @@
"regenerator-runtime/runtime.js",
"highcharts/modules/accessibility",
"highcharts/modules/exporting",
"highcharts/modules/export-data",
"highcharts/highcharts-more",
"highcharts/modules/heatmap",
"highcharts/modules/pattern-fill",
134 changes: 83 additions & 51 deletions erroranalysis/erroranalysis/_internal/matrix_filter.py
@@ -60,8 +60,8 @@ def compute_json_matrix(analyzer, features, filters, composite_filters):
return compute_matrix(analyzer, features, filters, composite_filters)


def compute_matrix(analyzer, features, filters, composite_filters,
quantile_binning=False, num_bins=BIN_THRESHOLD):
def compute_matrix_on_dataset(analyzer, features, dataset,
quantile_binning=False, num_bins=BIN_THRESHOLD):
"""Compute a matrix of metrics for a given set of feature names.

The filters and composite filters are used to filter the data
@@ -71,73 +71,36 @@ def compute_matrix(analyzer, features, filters, composite_filters,
:type analyzer: BaseAnalyzer
:param features: A list of one or two feature names to compute metrics for.
:type features: list
:param filters: A list of filters to apply to the data.
:type filters: list
:param composite_filters: A list of composite filters to apply to the data.
:type composite_filters: list
:param dataset: The dataset on which the matrix view needs to be computed.
The dataset should have the feature columns and the columns
'true_y' and 'index'. The 'true_y' column should have the true
target values corresponding to the test data. The 'index'
column should have the row indices. If the analyzer is of type
PredictionsAnalyzer, then the dataset should include the column
'pred_y' which will hold the predictions.
:type dataset: pd.DataFrame
:param quantile_binning: Whether to use quantile binning.
:type quantile_binning: bool
:param num_bins: The number of bins to use for quantile binning.
:type num_bins: int
:return: A dictionary representation of the computed matrix which can be
saved to JSON.
:rtype: dict

:Example:

An example of running compute_matrix with a filter and a composite
filter:

>>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
>>> from erroranalysis._internal.matrix_filter import (
... compute_matrix)
>>> from erroranalysis._internal.constants import ModelTask
>>> from sklearn.datasets import load_breast_cancer
>>> from sklearn.model_selection import train_test_split
>>> from sklearn import svm
>>> breast_cancer_data = load_breast_cancer()
>>> feature_names = breast_cancer_data.feature_names
>>> X_train, X_test, y_train, y_test = train_test_split(
... breast_cancer_data.data, breast_cancer_data.target,
... test_size=0.5, random_state=0)
>>> categorical_features = []
>>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
... random_state=777)
>>> model = clf.fit(X_train, y_train)
>>> model_task = ModelTask.CLASSIFICATION
>>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
... categorical_features, model_task=model_task)
>>> filters = [{'arg': [23.85], 'column': 'mean radius',
... 'method': 'less and equal'}]
>>> composite_filters = [{'compositeFilters':
... [{'compositeFilters':
... [{'arg': [13.45, 22.27],
... 'column': 'mean radius',
... 'method': 'in the range of'},
... {'arg': [10.88, 24.46],
... 'column': 'mean texture',
... 'method': 'in the range of'}],
... 'operation': 'and'}],
... 'operation': 'or'}]
>>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
... filters, composite_filters)
"""
if num_bins <= 0:
raise ValueError(
'Number of bins parameter must be greater than 0 for the heatmap')
if features[0] is None and features[1] is None:
raise ValueError(
'One or two features must be specified to compute the heat map')
filtered_df = filter_from_cohort(analyzer,
filters,
composite_filters)
true_y = filtered_df[TRUE_Y]

true_y = dataset[TRUE_Y]
dropped_cols = [TRUE_Y, ROW_INDEX]
is_model_analyzer = hasattr(analyzer, 'model')
if not is_model_analyzer:
pred_y = filtered_df[PRED_Y]
pred_y = dataset[PRED_Y]
dropped_cols.append(PRED_Y)
input_data = filtered_df.drop(columns=dropped_cols)
input_data = dataset.drop(columns=dropped_cols)
is_pandas = isinstance(analyzer.dataset, pd.DataFrame)
metric = analyzer.metric
if is_pandas:
@@ -322,6 +285,75 @@ def compute_matrix(analyzer, features, filters, composite_filters,
return matrix


def compute_matrix(analyzer, features, filters, composite_filters,
quantile_binning=False, num_bins=BIN_THRESHOLD):
"""Compute a matrix of metrics for a given set of feature names.

The filters and composite filters are used to filter the data
prior to computing the matrix.

:param analyzer: The error analyzer.
:type analyzer: BaseAnalyzer
:param features: A list of one or two feature names to compute metrics for.
:type features: list
:param filters: A list of filters to apply to the data.
:type filters: list
:param composite_filters: A list of composite filters to apply to the data.
:type composite_filters: list
:param quantile_binning: Whether to use quantile binning.
:type quantile_binning: bool
:param num_bins: The number of bins to use for quantile binning.
:type num_bins: int
:return: A dictionary representation of the computed matrix which can be
saved to JSON.
:rtype: dict

:Example:

An example of running compute_matrix with a filter and a composite
filter:

>>> from erroranalysis._internal.error_analyzer import ModelAnalyzer
>>> from erroranalysis._internal.matrix_filter import (
... compute_matrix)
>>> from erroranalysis._internal.constants import ModelTask
>>> from sklearn.datasets import load_breast_cancer
>>> from sklearn.model_selection import train_test_split
>>> from sklearn import svm
>>> breast_cancer_data = load_breast_cancer()
>>> feature_names = breast_cancer_data.feature_names
>>> X_train, X_test, y_train, y_test = train_test_split(
... breast_cancer_data.data, breast_cancer_data.target,
... test_size=0.5, random_state=0)
>>> categorical_features = []
>>> clf = svm.SVC(gamma=0.001, C=100., probability=True,
... random_state=777)
>>> model = clf.fit(X_train, y_train)
>>> model_task = ModelTask.CLASSIFICATION
>>> analyzer = ModelAnalyzer(model, X_test, y_test, feature_names,
... categorical_features, model_task=model_task)
>>> filters = [{'arg': [23.85], 'column': 'mean radius',
... 'method': 'less and equal'}]
>>> composite_filters = [{'compositeFilters':
... [{'compositeFilters':
... [{'arg': [13.45, 22.27],
... 'column': 'mean radius',
... 'method': 'in the range of'},
... {'arg': [10.88, 24.46],
... 'column': 'mean texture',
... 'method': 'in the range of'}],
... 'operation': 'and'}],
... 'operation': 'or'}]
>>> matrix = compute_matrix(analyzer, ['mean radius', 'mean texture'],
... filters, composite_filters)
"""
filtered_df = filter_from_cohort(analyzer,
filters,
composite_filters)
return compute_matrix_on_dataset(analyzer, features, filtered_df,
quantile_binning, num_bins)


def convert_dtypes(df):
"""Converts the dtypes of the dataframe to the most efficient type.

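For reference, a minimal sketch (not part of the diff) of how the new compute_matrix_on_dataset entry point could be called directly on a pre-assembled dataset. It mirrors the setup from the compute_matrix doctest above; the breast-cancer data, the hand-built DataFrame, and the chosen feature pair are illustrative assumptions, and the 'true_y'/'index' column names follow the docstring. compute_matrix itself remains the filter-based entry point: it now runs filter_from_cohort and delegates to compute_matrix_on_dataset.

import pandas as pd
from sklearn import svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from erroranalysis._internal.constants import ModelTask
from erroranalysis._internal.error_analyzer import ModelAnalyzer
from erroranalysis._internal.matrix_filter import compute_matrix_on_dataset

# Train a simple classifier, as in the doctest above.
breast_cancer_data = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data.data, breast_cancer_data.target,
    test_size=0.5, random_state=0)
model = svm.SVC(gamma=0.001, C=100., probability=True,
                random_state=777).fit(X_train, y_train)
analyzer = ModelAnalyzer(model, X_test, y_test,
                         list(breast_cancer_data.feature_names),
                         [], model_task=ModelTask.CLASSIFICATION)

# Assemble the dataset the docstring describes: the feature columns plus
# 'true_y' (true labels) and 'index' (row indices). Any cohort filtering
# would already have been applied to this frame.
dataset = pd.DataFrame(X_test, columns=breast_cancer_data.feature_names)
dataset['true_y'] = y_test
dataset['index'] = range(X_test.shape[0])

# Compute the heatmap matrix for two features on the pre-filtered data.
matrix = compute_matrix_on_dataset(
    analyzer, ['mean radius', 'mean texture'], dataset)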
120 changes: 82 additions & 38 deletions erroranalysis/erroranalysis/_internal/surrogate_error_tree.py
@@ -96,6 +96,80 @@ def compute_json_error_tree(analyzer,
min_child_samples)


def compute_error_tree_on_dataset(
analyzer,
features,
dataset,
max_depth=DEFAULT_MAX_DEPTH,
num_leaves=DEFAULT_NUM_LEAVES,
min_child_samples=DEFAULT_MIN_CHILD_SAMPLES):
"""Computes the error tree for the given dataset.

:param analyzer: The error analyzer containing the categorical
features and categories for the full dataset.
:type analyzer: BaseAnalyzer
:param features: A list of one or two feature names to train the
surrogate model on.
:type features: list
:param dataset: The dataset on which the error tree needs to be computed.
The dataset should have the feature columns and the columns
'true_y' and 'index'. The 'true_y' column should have the true
target values corresponding to the test data. The 'index'
column should have the row indices. If the analyzer is of type
PredictionsAnalyzer, then the dataset should include the column
'pred_y' which will hold the predictions.
:type dataset: pd.DataFrame
:param max_depth: The maximum depth of the surrogate tree trained
on errors.
:type max_depth: int
:param num_leaves: The number of leaves of the surrogate tree
trained on errors.
:type num_leaves: int
:param min_child_samples: The minimum number of samples required to
create one leaf.
:type min_child_samples: int
:return: The tree representation as a list of nodes.
:rtype: list[dict[str, str]]
"""
if max_depth is None:
max_depth = DEFAULT_MAX_DEPTH
if num_leaves is None:
num_leaves = DEFAULT_NUM_LEAVES
if min_child_samples is None:
min_child_samples = DEFAULT_MIN_CHILD_SAMPLES

if dataset.shape[0] == 0:
return create_empty_node(analyzer.metric)
is_model_analyzer = hasattr(analyzer, MODEL)
indexes = []
for feature in features:
indexes.append(analyzer.feature_names.index(feature))
dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
dataset_sub_names = list(dataset_sub_names)
if not is_spark(dataset):
booster, dataset_indexed_df, cat_info = get_surrogate_booster_local(
dataset, analyzer, is_model_analyzer, indexes,
dataset_sub_names, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed, categories_reindexed = cat_info
else:
booster, dataset_indexed_df = get_surrogate_booster_pyspark(
dataset, analyzer, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed = []
categories_reindexed = []
dumped_model = booster.dump_model()
tree_structure = dumped_model["tree_info"][0]['tree_structure']
max_split_index = get_max_split_index(tree_structure) + 1
cache_subtree_features(tree_structure, dataset_sub_names)
tree = traverse(dataset_indexed_df,
tree_structure,
max_split_index,
(categories_reindexed,
cat_ind_reindexed),
[],
dataset_sub_names,
metric=analyzer.metric,
classes=analyzer.classes)
return tree


def compute_error_tree(analyzer,
features,
filters,
@@ -165,47 +239,17 @@ def compute_error_tree(analyzer,
... filters, composite_filters)
"""
# Fit a surrogate model on errors
if max_depth is None:
max_depth = DEFAULT_MAX_DEPTH
if num_leaves is None:
num_leaves = DEFAULT_NUM_LEAVES
if min_child_samples is None:
min_child_samples = DEFAULT_MIN_CHILD_SAMPLES
filtered_df = filter_from_cohort(analyzer,
filters,
composite_filters)
if filtered_df.shape[0] == 0:
return create_empty_node(analyzer.metric)
is_model_analyzer = hasattr(analyzer, MODEL)
indexes = []
for feature in features:
indexes.append(analyzer.feature_names.index(feature))
dataset_sub_names = np.array(analyzer.feature_names)[np.array(indexes)]
dataset_sub_names = list(dataset_sub_names)
if not is_spark(filtered_df):
booster, filtered_indexed_df, cat_info = get_surrogate_booster_local(
filtered_df, analyzer, is_model_analyzer, indexes,
dataset_sub_names, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed, categories_reindexed = cat_info
else:
booster, filtered_indexed_df = get_surrogate_booster_pyspark(
filtered_df, analyzer, max_depth, num_leaves, min_child_samples)
cat_ind_reindexed = []
categories_reindexed = []
dumped_model = booster.dump_model()
tree_structure = dumped_model["tree_info"][0]['tree_structure']
max_split_index = get_max_split_index(tree_structure) + 1
cache_subtree_features(tree_structure, dataset_sub_names)
tree = traverse(filtered_indexed_df,
tree_structure,
max_split_index,
(categories_reindexed,
cat_ind_reindexed),
[],
dataset_sub_names,
metric=analyzer.metric,
classes=analyzer.classes)
return tree
return compute_error_tree_on_dataset(
analyzer,
features,
filtered_df,
max_depth=max_depth,
num_leaves=num_leaves,
min_child_samples=min_child_samples
)


def get_surrogate_booster_local(filtered_df, analyzer, is_model_analyzer,
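Similarly, a hedged sketch of the new compute_error_tree_on_dataset entry point, reusing the analyzer and dataset objects built in the compute_matrix_on_dataset sketch above. The max_depth, num_leaves and min_child_samples values shown are illustrative, not the module defaults; passing None makes the function fall back to DEFAULT_MAX_DEPTH, DEFAULT_NUM_LEAVES and DEFAULT_MIN_CHILD_SAMPLES. compute_error_tree keeps its filter-based signature and now delegates to this function after filter_from_cohort.

from erroranalysis._internal.surrogate_error_tree import (
    compute_error_tree_on_dataset)

# Reuses `analyzer` and `dataset` from the previous sketch. The surrogate
# tree is trained on the selected features over the supplied dataset.
tree = compute_error_tree_on_dataset(
    analyzer,
    ['mean radius', 'mean texture'],
    dataset,
    max_depth=3,            # illustrative values, not the module defaults
    num_leaves=31,
    min_child_samples=20)

# `tree` is the surrogate tree as a list of node dictionaries
# (see the docstring above).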