diff --git a/raiwidgets/raiwidgets/_cohort.py b/raiwidgets/raiwidgets/_cohort.py index 532eb16fad..5e490cfa3a 100644 --- a/raiwidgets/raiwidgets/_cohort.py +++ b/raiwidgets/raiwidgets/_cohort.py @@ -82,7 +82,7 @@ class CohortFilter: PREDICTED_Y = 'Predicted Y' TRUE_Y = 'True Y' INDEX = 'Index' - CLASSIFICATION_OUTCOME = 'Classification Outcome' + CLASSIFICATION_OUTCOME = 'Classification outcome' REGRESSION_ERROR = 'Error' SPECIAL_COLUMN_LIST = [INDEX, @@ -130,11 +130,11 @@ def _validate_cohort_filter_parameters( 3. The arg shouldn't be an empty list. 4. For all cohort filter methods in CohortFilterMethods.SINGLE_VALUE_METHODS, the value in the arg - should be integer or float and there should be utmost one value + should be integer or float and there should be only one value in arg. 5. For cohort filter method CohortFilterMethods.METHOD_RANGE, the values in the arg should be integer or float and there - should be utmost two values in arg. + should be only two values in arg. """ if not isinstance(method, str): raise UserConfigValidationException( @@ -223,12 +223,12 @@ def _validate_with_test_data(self, test_data: pd.DataFrame, 2. The Index filter doesn't take CohortFilterMethods.EXCLUDES filter method. - "Classification Outcome" Filter validations - 1. Validate that "Classification Outcome" filter is not configure for + "Classification outcome" Filter validations + 1. Validate that "Classification outcome" filter is not configure for multiclass classification and regression. - 2. The "Classification Outcome" filter only contains values from set + 2. The "Classification outcome" filter only contains values from set ClassificationOutcomes. - 3. The "Classification Outcome" filter only takes + 3. The "Classification outcome" filter only takes CohortFilterMethods.INCLUDES filter method. "Error" Filter validations @@ -279,7 +279,7 @@ def _validate_with_test_data(self, test_data: pd.DataFrame, "All entries in arg should be of type int." ) elif self.column == CohortFilter.CLASSIFICATION_OUTCOME: - # "Classification Outcome" Filter validations + # "Classification outcome" Filter validations is_multiclass = len(np.unique( test_data[target_column].values).tolist()) > 2 @@ -370,11 +370,13 @@ def _validate_with_test_data(self, test_data: pd.DataFrame, categories = np.unique( test_data[self.column].values).tolist() - if not all(entry in categories for entry in self.arg): - raise UserConfigValidationException( - "Found a category in arg which is not present in " - "test data" - ) + for entry in self.arg: + if entry not in categories: + raise UserConfigValidationException( + "Found a category {0} in arg which is not present " + "in test data column {1}.".format( + entry, self.column) + ) class Cohort: @@ -441,10 +443,12 @@ def _validate_with_test_data(self, test_data: pd.DataFrame, if not isinstance(categorical_features, list): raise UserConfigValidationException( "Expected a list type for categorical columns.") - if not all(isinstance(entry, str) for entry in categorical_features): - raise UserConfigValidationException( - "All entries in categorical_features need of string type." - ) + for categorical_feature in categorical_features: + if not isinstance(categorical_feature, str): + raise UserConfigValidationException( + "Feature {0} in categorical_features need to be of " + "string type.".format(categorical_feature) + ) if target_column not in test_data.columns: raise UserConfigValidationException( @@ -453,12 +457,12 @@ def _validate_with_test_data(self, test_data: pd.DataFrame, ) test_data_columns_set = set(test_data.columns) - set([target_column]) - if not all(entry in test_data_columns_set - for entry in categorical_features): - raise UserConfigValidationException( - "Found some categorical feature name which is not" - " present in test data." - ) + for categorical_feature in categorical_features: + if categorical_feature not in test_data_columns_set: + raise UserConfigValidationException( + "Found categorical feature {0} which is not" + " present in test data.".format(categorical_feature) + ) for cohort_filter in self.cohort_filter_list: cohort_filter._validate_with_test_data( diff --git a/raiwidgets/tests/test_cohort.py b/raiwidgets/tests/test_cohort.py index 63215bdbd5..8bae46f415 100644 --- a/raiwidgets/tests/test_cohort.py +++ b/raiwidgets/tests/test_cohort.py @@ -12,6 +12,23 @@ from responsibleai.exceptions import UserConfigValidationException +def get_toy_binary_classification_dataset(): + return pd.DataFrame(data=[[23, 'X'], [25, 'Y']], + columns=["age", "target"]) + + +def get_toy_multiclass_classification_dataset(): + return pd.DataFrame( + data=[[23, 'X'], [25, 'Y'], [25, 'Z']], + columns=["age", "target"]) + + +def get_toy_regression_dataset(): + return pd.DataFrame( + data=[[23, 2.5], [25, 3.6], [25, 4.6]], + columns=["age", "target"]) + + class TestCohortFilter: def test_cohort_filter_validate_method(self): with pytest.raises( @@ -103,9 +120,8 @@ def test_cohort_filter_validate_in_range_methods_type_arg_entries( @pytest.mark.parametrize('method', CohortFilterMethods.SINGLE_VALUE_METHODS) def test_cohort_filter_serialization_single_value_methods(self, method): - cohort_filter_1 = \ - CohortFilter(method=method, - arg=[65.0], column='age') + cohort_filter_1 = CohortFilter(method=method, + arg=[65.0], column='age') json_str = json.dumps(cohort_filter_1, default=cohort_filter_json_converter) assert method in json_str @@ -113,9 +129,9 @@ def test_cohort_filter_serialization_single_value_methods(self, method): assert 'age' in json_str def test_cohort_filter_serialization_in_range_method(self): - cohort_filter_1 = \ - CohortFilter(method=CohortFilterMethods.METHOD_RANGE, - arg=[65.0, 70.0], column='age') + cohort_filter_1 = CohortFilter( + method=CohortFilterMethods.METHOD_RANGE, + arg=[65.0, 70.0], column='age') json_str = json.dumps(cohort_filter_1, default=cohort_filter_json_converter) assert CohortFilterMethods.METHOD_RANGE in json_str @@ -127,9 +143,9 @@ def test_cohort_filter_serialization_in_range_method(self): [CohortFilterMethods.METHOD_INCLUDES, CohortFilterMethods.METHOD_EXCLUDES]) def test_cohort_filter_serialization_include_exclude_methods(self, method): - cohort_filter_str = \ - CohortFilter(method=method, - arg=['val1', 'val2', 'val3'], column='age') + cohort_filter_str = CohortFilter(method=method, + arg=['val1', 'val2', 'val3'], + column='age') json_str = json.dumps(cohort_filter_str, default=cohort_filter_json_converter) assert method in json_str @@ -138,9 +154,9 @@ def test_cohort_filter_serialization_include_exclude_methods(self, method): assert 'val3' in json_str assert 'age' in json_str - cohort_filter_int = \ - CohortFilter(method=method, - arg=[1, 2, 3], column='age') + cohort_filter_int = CohortFilter(method=method, + arg=[1, 2, 3], + column='age') json_str = json.dumps(cohort_filter_int, default=cohort_filter_json_converter) assert method in json_str @@ -152,12 +168,11 @@ def test_cohort_filter_serialization_include_exclude_methods(self, method): class TestCohortFilterDataValidations: def test_validate_with_test_data_high_level_validations(self): - test_data = pd.DataFrame(data=[[23, 'X'], [25, 'Y']], - columns=["age", "target"]) + test_data = get_toy_binary_classification_dataset() - cohort_filter_not_a_feature = \ - CohortFilter(method=CohortFilterMethods.METHOD_LESS, - arg=[65], column='fake_column') + cohort_filter_not_a_feature = CohortFilter( + method=CohortFilterMethods.METHOD_LESS, + arg=[65], column='fake_column') with pytest.raises( UserConfigValidationException, @@ -167,12 +182,11 @@ def test_validate_with_test_data_high_level_validations(self): categorical_features=[]) def test_validate_with_test_data_index_filter_validations(self): - test_data = pd.DataFrame(data=[[23, 'X'], [25, 'Y']], - columns=["age", "target"]) + test_data = get_toy_binary_classification_dataset() - cohort_filter_index_excludes = \ - CohortFilter(method=CohortFilterMethods.METHOD_EXCLUDES, - arg=[65], column=CohortFilter.INDEX) + cohort_filter_index_excludes = CohortFilter( + method=CohortFilterMethods.METHOD_EXCLUDES, + arg=[65], column=CohortFilter.INDEX) with pytest.raises( UserConfigValidationException, match="excludes filter is not supported with Index based " @@ -182,9 +196,9 @@ def test_validate_with_test_data_index_filter_validations(self): categorical_features=[] ) - cohort_filter_index_incorrect_args = \ - CohortFilter(method=CohortFilterMethods.METHOD_GREATER, - arg=[65.0], column=CohortFilter.INDEX) + cohort_filter_index_incorrect_args = CohortFilter( + method=CohortFilterMethods.METHOD_GREATER, + arg=[65.0], column=CohortFilter.INDEX) with pytest.raises( UserConfigValidationException, match="All entries in arg should be of type int."): @@ -195,27 +209,23 @@ def test_validate_with_test_data_index_filter_validations(self): def test_validate_with_test_data_classification_error_filter_validations( self): - test_data_multiclass = pd.DataFrame( - data=[[23, 'X'], [25, 'Y'], [25, 'Z']], - columns=["age", "target"]) + test_data_multiclass = get_toy_multiclass_classification_dataset() - test_data_binary = pd.DataFrame( - data=[[23, 'X'], [25, 'Y']], - columns=["age", "target"]) + test_data_binary = get_toy_binary_classification_dataset() - cohort_filter_classification_excludes = \ - CohortFilter(method=CohortFilterMethods.METHOD_EXCLUDES, - arg=[ClassificationOutcomes.FALSE_NEGATIVE], - column=CohortFilter.CLASSIFICATION_OUTCOME) + cohort_filter_classification_excludes = CohortFilter( + method=CohortFilterMethods.METHOD_EXCLUDES, + arg=[ClassificationOutcomes.FALSE_NEGATIVE], + column=CohortFilter.CLASSIFICATION_OUTCOME) - cohort_filter_classification_includes = \ - CohortFilter(method=CohortFilterMethods.METHOD_INCLUDES, - arg=["random"], - column=CohortFilter.CLASSIFICATION_OUTCOME) + cohort_filter_classification_includes = CohortFilter( + method=CohortFilterMethods.METHOD_INCLUDES, + arg=["random"], + column=CohortFilter.CLASSIFICATION_OUTCOME) with pytest.raises( UserConfigValidationException, - match="Classification Outcome cannot be " + match="Classification outcome cannot be " "configured for multi-class classification" " and regression scenarios."): cohort_filter_classification_excludes._validate_with_test_data( @@ -225,7 +235,7 @@ def test_validate_with_test_data_classification_error_filter_validations( with pytest.raises( UserConfigValidationException, - match="Classification Outcome cannot be " + match="Classification outcome cannot be " "configured for multi-class classification" " and regression scenarios."): cohort_filter_classification_excludes._validate_with_test_data( @@ -235,7 +245,7 @@ def test_validate_with_test_data_classification_error_filter_validations( with pytest.raises( UserConfigValidationException, - match="Classification Outcome can only be configured with " + match="Classification outcome can only be configured with " "cohort filter includes."): cohort_filter_classification_excludes._validate_with_test_data( test_data=test_data_binary, target_column="target", @@ -244,7 +254,7 @@ def test_validate_with_test_data_classification_error_filter_validations( with pytest.raises( UserConfigValidationException, - match="Classification Outcome can only take argument values " + match="Classification outcome can only take argument values " "from False negative or False positive or True " "negative or True positive."): cohort_filter_classification_includes._validate_with_test_data( @@ -253,14 +263,12 @@ def test_validate_with_test_data_classification_error_filter_validations( def test_validate_with_test_data_regression_error_filter_validations( self): - test_data_regression = pd.DataFrame( - data=[[23, 2.5], [25, 3.6], [25, 4.6]], - columns=["age", "target"]) + test_data_regression = get_toy_regression_dataset() - cohort_filter_regression = \ - CohortFilter(method=CohortFilterMethods.METHOD_LESS, - arg=[2.5], - column=CohortFilter.REGRESSION_ERROR) + cohort_filter_regression = CohortFilter( + method=CohortFilterMethods.METHOD_LESS, + arg=[2.5], + column=CohortFilter.REGRESSION_ERROR) with pytest.raises( UserConfigValidationException, @@ -317,19 +325,17 @@ def test_validate_with_test_data_regression_error_filter_validations( CohortFilterMethods.METHOD_EXCLUDES]) def test_validate_with_test_data_regression_target_filter_validations( self, target_filter_type, method): - test_data_regression = pd.DataFrame( - data=[[23, 2.5], [25, 3.6], [25, 4.6]], - columns=["age", "target"]) + test_data_regression = get_toy_regression_dataset() with pytest.raises( UserConfigValidationException, match="{0} cannot be configured with " "filter {1} for regression.".format(target_filter_type, method)): - cohort_filter_regression = \ - CohortFilter(method=method, - arg=[2.5], - column=target_filter_type) + cohort_filter_regression = CohortFilter( + method=method, + arg=[2.5], + column=target_filter_type) cohort_filter_regression._validate_with_test_data( test_data=test_data_regression, target_column="target", @@ -341,9 +347,7 @@ def test_validate_with_test_data_regression_target_filter_validations( CohortFilter.TRUE_Y]) def test_validate_with_test_data_classification_target_filter_validations( self, target_filter_type): - test_data_classification = pd.DataFrame( - data=[[23, 'A'], [25, 'B'], [25, 'B']], - columns=["age", "target"]) + test_data_classification = get_toy_binary_classification_dataset() with pytest.raises( UserConfigValidationException, @@ -351,10 +355,10 @@ def test_validate_with_test_data_classification_target_filter_validations( "filter {1} for classification".format( target_filter_type, CohortFilterMethods.METHOD_INCLUDES)): - cohort_filter_classification = \ - CohortFilter(method=CohortFilterMethods.METHOD_EXCLUDES, - arg=['A'], - column=target_filter_type) + cohort_filter_classification = CohortFilter( + method=CohortFilterMethods.METHOD_EXCLUDES, + arg=['X'], + column=target_filter_type) cohort_filter_classification._validate_with_test_data( test_data=test_data_classification, target_column="target", @@ -365,10 +369,10 @@ def test_validate_with_test_data_classification_target_filter_validations( UserConfigValidationException, match="Found a class in arg which is not present in " "test data"): - cohort_filter_classification = \ - CohortFilter(method=CohortFilterMethods.METHOD_INCLUDES, - arg=['Z'], - column=target_filter_type) + cohort_filter_classification = CohortFilter( + method=CohortFilterMethods.METHOD_INCLUDES, + arg=['Z'], + column=target_filter_type) cohort_filter_classification._validate_with_test_data( test_data=test_data_classification, target_column="target", @@ -387,10 +391,10 @@ def test_validate_with_test_data_with_dataset_validations( "configured with {1} cohort filter.".format( "type", CohortFilterMethods.METHOD_INCLUDES)): - cohort_filter = \ - CohortFilter(method=CohortFilterMethods.METHOD_EXCLUDES, - arg=['new'], - column='type') + cohort_filter = CohortFilter( + method=CohortFilterMethods.METHOD_EXCLUDES, + arg=['new'], + column='type') cohort_filter._validate_with_test_data( test_data=test_data, target_column="target", @@ -399,12 +403,12 @@ def test_validate_with_test_data_with_dataset_validations( with pytest.raises( UserConfigValidationException, - match="Found a category in arg which is not present in " - "test data"): - cohort_filter = \ - CohortFilter(method=CohortFilterMethods.METHOD_INCLUDES, - arg=['mid'], - column='type') + match="Found a category {0} in arg which is not present " + "in test data column {1}.".format('mid', 'type')): + cohort_filter = CohortFilter( + method=CohortFilterMethods.METHOD_INCLUDES, + arg=['mid'], + column='type') cohort_filter._validate_with_test_data( test_data=test_data, target_column="target", @@ -428,19 +432,18 @@ def test_cohort_configuration_validations(self): cohort.add_cohort_filter(cohort_filter=[]) def test_cohort_validate_with_test_data(self): - cohort_filter_1 = \ - CohortFilter(method=CohortFilterMethods.METHOD_LESS, - arg=[65], column='age') + cohort_filter_1 = CohortFilter( + method=CohortFilterMethods.METHOD_LESS, + arg=[65], column='age') cohort_1 = Cohort(name="Cohort New") cohort_1.add_cohort_filter(cohort_filter_1) - test_data = pd.DataFrame(data=[[23, 'X'], [25, 'Y']], - columns=["age", "target"]) + test_data = get_toy_binary_classification_dataset() with pytest.raises( UserConfigValidationException, match="The test_data should be a pandas DataFrame"): cohort_1._validate_with_test_data( - test_data=[], target_column='income', + test_data=[], target_column='target', categorical_features=[]) with pytest.raises( @@ -471,8 +474,8 @@ def test_cohort_validate_with_test_data(self): with pytest.raises( UserConfigValidationException, - match="All entries in categorical_features " - "need of string type."): + match="Feature 1 in categorical_features need to be of " + "string type."): cohort_1._validate_with_test_data( test_data=test_data, target_column="target", @@ -480,7 +483,7 @@ def test_cohort_validate_with_test_data(self): with pytest.raises( UserConfigValidationException, - match="Found some categorical feature name which is not" + match="Found categorical feature hours-per-week which is not" " present in test data."): cohort_1._validate_with_test_data( test_data=test_data, @@ -490,9 +493,8 @@ def test_cohort_validate_with_test_data(self): @pytest.mark.parametrize('method', CohortFilterMethods.SINGLE_VALUE_METHODS) def test_cohort_serialization_single_value_method(self, method): - cohort_filter_1 = \ - CohortFilter(method=method, - arg=[65], column='age') + cohort_filter_1 = CohortFilter(method=method, + arg=[65], column='age') cohort_1 = Cohort(name="Cohort New") cohort_1.add_cohort_filter(cohort_filter_1) json_str = json.dumps(cohort_1, @@ -504,9 +506,9 @@ def test_cohort_serialization_single_value_method(self, method): assert 'age' in json_str def test_cohort_serialization_in_range_method(self): - cohort_filter_1 = \ - CohortFilter(method=CohortFilterMethods.METHOD_RANGE, - arg=[65.0, 70.0], column='age') + cohort_filter_1 = CohortFilter( + method=CohortFilterMethods.METHOD_RANGE, + arg=[65.0, 70.0], column='age') cohort_1 = Cohort(name="Cohort New") cohort_1.add_cohort_filter(cohort_filter_1) json_str = json.dumps(cohort_1, @@ -522,9 +524,9 @@ def test_cohort_serialization_in_range_method(self): [CohortFilterMethods.METHOD_INCLUDES, CohortFilterMethods.METHOD_EXCLUDES]) def test_cohort_serialization_include_exclude_methods(self, method): - cohort_filter_str = \ - CohortFilter(method=method, - arg=['val1', 'val2', 'val3'], column='age') + cohort_filter_str = CohortFilter(method=method, + arg=['val1', 'val2', 'val3'], + column='age') cohort_str = Cohort(name="Cohort New Str") cohort_str.add_cohort_filter(cohort_filter_str) json_str = json.dumps(cohort_str, @@ -535,9 +537,9 @@ def test_cohort_serialization_include_exclude_methods(self, method): assert 'val3' in json_str assert 'age' in json_str - cohort_filter_int = \ - CohortFilter(method=method, - arg=[1, 2, 3], column='age') + cohort_filter_int = CohortFilter(method=method, + arg=[1, 2, 3], + column='age') cohort_int = Cohort(name="Cohort New Int") cohort_int.add_cohort_filter(cohort_filter_int) json_str = json.dumps(cohort_filter_int, @@ -551,9 +553,9 @@ def test_cohort_serialization_include_exclude_methods(self, method): class TestCohortList: def test_cohort_list_serialization(self): - cohort_filter_1 = \ - CohortFilter(method=CohortFilterMethods.METHOD_LESS, - arg=[65], column='age') + cohort_filter_1 = CohortFilter( + method=CohortFilterMethods.METHOD_LESS, + arg=[65], column='age') cohort_1 = Cohort(name="Cohort New") cohort_1.add_cohort_filter(cohort_filter_1)