diff --git a/responsibleai/responsibleai/managers/error_analysis_manager.py b/responsibleai/responsibleai/managers/error_analysis_manager.py
index 5c5cd929c9..aef217314b 100644
--- a/responsibleai/responsibleai/managers/error_analysis_manager.py
+++ b/responsibleai/responsibleai/managers/error_analysis_manager.py
@@ -253,8 +253,11 @@ def __init__(self, model: Any, dataset: pd.DataFrame, target_column: str,
             for evaluating the model.
         :type dropped_features: Optional[List[str]]
         """
-        self._true_y = dataset[target_column]
-        self._dataset = dataset.drop(columns=[target_column])
+        self._true_y = None if target_column is None else dataset[target_column]
+        if self._true_y is None:
+            self._dataset = dataset.copy()
+        else:
+            self._dataset = dataset.drop(columns=[target_column])
         self._feature_names = list(self._dataset.columns)
         self._model_task = model_task
         self._classes = classes
diff --git a/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py b/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
index 552f31f2ff..7abd6a2c25 100644
--- a/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
+++ b/responsibleai_text/responsibleai_text/managers/error_analysis_manager.py
@@ -231,7 +231,8 @@ def _create_index_predictor(model, dataset, target_column,
     :return: A wrapped predictor that uses index to retrieve text data.
     :rtype: WrappedIndexPredictorModel
     """
-    dataset = dataset.drop(columns=[target_column])
+    if target_column is not None:
+        dataset = dataset.drop(columns=[target_column])
     dataset = get_text_columns(dataset, text_column)
     index_predictor = WrappedIndexPredictorModel(
         model, dataset, is_multilabel, task_type, classes)
diff --git a/responsibleai_text/responsibleai_text/managers/explainer_manager.py b/responsibleai_text/responsibleai_text/managers/explainer_manager.py
index 9160f823e2..e33f077aea 100644
--- a/responsibleai_text/responsibleai_text/managers/explainer_manager.py
+++ b/responsibleai_text/responsibleai_text/managers/explainer_manager.py
@@ -74,10 +74,13 @@ def __init__(self, model: Any, evaluation_examples: pd.DataFrame,
         """
         self._model = model
         self._target_column = target_column
-        if not isinstance(target_column, list):
+        if not isinstance(target_column, (list, type(None))):
             target_column = [target_column]
-        self._evaluation_examples = \
-            evaluation_examples.drop(columns=target_column)
+        if target_column is None:
+            self._evaluation_examples = evaluation_examples
+        else:
+            self._evaluation_examples = \
+                evaluation_examples.drop(columns=target_column)
         self._is_run = False
         self._is_added = False
         self._features = list(self._evaluation_examples.columns)
diff --git a/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py b/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
index 172bae9a90..a37faedd7c 100644
--- a/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
+++ b/responsibleai_text/responsibleai_text/rai_text_insights/rai_text_insights.py
@@ -183,7 +183,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
         self._ext_test = ext_test
         self._ext_features = ext_features
         self._ext_test_df = pd.DataFrame(ext_test, columns=ext_features)
-        self._ext_test_df[target_column] = test[target_column]
+        if target_column is not None:
+            self._ext_test_df[target_column] = test[target_column]
         self.predict_output = None
 
         super(RAITextInsights, self).__init__(
@@ -266,11 +267,13 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
             an exception will be raised.
         :type text_column: str or list[str]
         """
-        if not isinstance(target_column, list):
-            target_column = [target_column]
-        # Pick one row from test data
-        small_test_data = test.iloc[0:1].drop(
-            target_column, axis=1)
+        small_test_data = test.iloc[0:1]
+        if target_column is not None:
+            if not isinstance(target_column, list):
+                target_column = [target_column]
+            # Pick one row from test data
+            small_test_data = small_test_data.drop(
+                target_column, axis=1)
         small_test_data = get_text_columns(small_test_data, text_column)
         small_test_data = small_test_data.iloc[0]
         if task_type not in [
@@ -369,6 +372,9 @@ def _validate_rai_insights_input_parameters(
             if not target_columns_set.issubset(set(test.columns)):
                 raise UserConfigValidationException(
                     'The list of target_column(s) should be in test data')
+        elif task_type == ModelTask.GENERATIVE_TEXT.value and target_column is None:
+            # target column is optional for generative text
+            pass
         else:
             if target_column not in list(test.columns):
                 raise UserConfigValidationException(
@@ -522,7 +528,10 @@ def _get_test_text_data(self, is_classification_task):
         elif self.task_type == ModelTask.QUESTION_ANSWERING:
             dataset = self.test.drop([self.target_column], axis=1)
         elif self.task_type == ModelTask.GENERATIVE_TEXT:
-            dataset = self.test.drop([self.target_column], axis=1)
+            if self.target_column is None:
+                dataset = self.test.copy()
+            else:
+                dataset = self.test.drop([self.target_column], axis=1)
         else:
             raise ValueError("Unknown task type: {}".format(self.task_type))
         dataset = get_text_columns(dataset, self._text_column)
@@ -578,13 +587,16 @@ def _get_dataset(self):
 
         dashboard_dataset.features = self._ext_test
 
-        true_y = self.test[self.target_column]
-        if true_y is not None and len(true_y) == row_length:
-            true_y = convert_to_list(true_y)
-            if is_classification_task:
-                true_y = self._convert_labels(
-                    true_y, dashboard_dataset.class_names)
-            dashboard_dataset.true_y = true_y
+        if self.target_column is None:
+            dashboard_dataset.true_y = None
+        else:
+            true_y = self.test[self.target_column]
+            if true_y is not None and len(true_y) == row_length:
+                true_y = convert_to_list(true_y)
+                if is_classification_task:
+                    true_y = self._convert_labels(
+                        true_y, dashboard_dataset.class_names)
+                dashboard_dataset.true_y = true_y
 
         dashboard_dataset.feature_names = self._ext_features
         dashboard_dataset.target_column = self.target_column
diff --git a/responsibleai_text/responsibleai_text/utils/feature_extractors.py b/responsibleai_text/responsibleai_text/utils/feature_extractors.py
index 99cab441db..afea5eb9b2 100644
--- a/responsibleai_text/responsibleai_text/utils/feature_extractors.py
+++ b/responsibleai_text/responsibleai_text/utils/feature_extractors.py
@@ -13,7 +13,8 @@
 from nlp_feature_extractors import attribute_extractors as exts
 
 from responsibleai_text.common.constants import (ModelTask,
-                                                 QuestionAnsweringFields)
+                                                 QuestionAnsweringFields,
+                                                 GenerativeTextFields)
 
 nlp = None
 
@@ -62,17 +63,7 @@ def extract_features(text_dataset: pd.DataFrame,
         feature_names.append("context_overlap")
     elif task_type == ModelTask.GENERATIVE_TEXT:
         # TODO: Add feature names for generative text
-        start_meta_index += 1
-        feature_names = []
-        prefixes = [QuestionAnsweringFields.CONTEXT + "_",
-                    QuestionAnsweringFields.QUESTION + "_"]
-        for prefix in prefixes:
-            for feature_name in base_feature_names:
-                feature_names.append(prefix + feature_name)
-            feature_names.append(prefix + "average_parse_tree_depth")
-            feature_names.append(prefix + "maximum_parse_tree_depth")
-        feature_names.append("question_type")
-        feature_names.append("context_overlap")
+        feature_names = base_feature_names
     else:
         raise ValueError("Unknown task type: {}".format(task_type))
     # copy over the metadata column names
@@ -80,9 +71,13 @@ def extract_features(text_dataset: pd.DataFrame,
         if has_dropped_features and column_names[j] in dropped_features:
             continue
         feature_names.append(column_names[j])
-    if not isinstance(target_column, list):
+
+    if not isinstance(target_column, (list, type(None))):
         target_column = [target_column]
-    text_features = text_dataset.drop(target_column, axis=1)
+
+    text_features = text_dataset.copy()
+    if target_column is not None:
+        text_features = text_features.drop(target_column, axis=1)
 
     if task_type in single_text_col_tasks:
         sentences = text_features.iloc[:, 0].tolist()
@@ -116,7 +111,17 @@ def extract_features(text_dataset: pd.DataFrame,
             results.append(extracted_features)
     elif task_type == ModelTask.GENERATIVE_TEXT:
         # TODO: Add feature extraction for generative text
-        pass
+        for i, row in tqdm(text_features.iterrows(), desc='feature extraction'):
+            extracted_features = []
+            add_extracted_features_for_sentence(
+                row[GenerativeTextFields.PROMPT], extracted_features,
+                task_type)
+
+            # append all other metadata features
+            append_metadata_values(start_meta_index, text_dataset, i,
+                                   extracted_features, has_dropped_features,
+                                   dropped_features, column_names)
+            results.append(extracted_features)
     else:
         raise ValueError("Unknown task type: {}".format(task_type))
     return results, feature_names