Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove target_column requirement for genai tasks #2501

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,11 @@ def __init__(self, model: Any, dataset: pd.DataFrame, target_column: str,
for evaluating the model.
:type dropped_features: Optional[List[str]]
"""
self._true_y = dataset[target_column]
self._dataset = dataset.drop(columns=[target_column])
self._true_y = None if target_column is None else dataset[target_column]
if self._true_y is None:
self._dataset = dataset.copy()
else:
self._dataset = dataset.drop(columns=[target_column])
self._feature_names = list(self._dataset.columns)
self._model_task = model_task
self._classes = classes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ def _create_index_predictor(model, dataset, target_column,
:return: A wrapped predictor that uses index to retrieve text data.
:rtype: WrappedIndexPredictorModel
"""
dataset = dataset.drop(columns=[target_column])
if target_column is not None:
dataset = dataset.drop(columns=[target_column])
dataset = get_text_columns(dataset, text_column)
index_predictor = WrappedIndexPredictorModel(
model, dataset, is_multilabel, task_type, classes)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,13 @@ def __init__(self, model: Any, evaluation_examples: pd.DataFrame,
"""
self._model = model
self._target_column = target_column
if not isinstance(target_column, list):
if not isinstance(target_column, (list, type(None))):
target_column = [target_column]
self._evaluation_examples = \
evaluation_examples.drop(columns=target_column)
if target_column is None:
self._evaluation_examples = evaluation_examples
else:
self._evaluation_examples = \
evaluation_examples.drop(columns=target_column)
self._is_run = False
self._is_added = False
self._features = list(self._evaluation_examples.columns)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
self._ext_test = ext_test
self._ext_features = ext_features
self._ext_test_df = pd.DataFrame(ext_test, columns=ext_features)
self._ext_test_df[target_column] = test[target_column]
if target_column is not None:
self._ext_test_df[target_column] = test[target_column]
self.predict_output = None

super(RAITextInsights, self).__init__(
Expand Down Expand Up @@ -266,11 +267,13 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
an exception will be raised.
:type text_column: str or list[str]
"""
if not isinstance(target_column, list):
target_column = [target_column]
# Pick one row from test data
small_test_data = test.iloc[0:1].drop(
target_column, axis=1)
small_test_data = test.iloc[0:1]
if target_column is not None:
if not isinstance(target_column, list):
target_column = [target_column]
# Pick one row from test data
small_test_data = small_test_data.drop(
target_column, axis=1)
small_test_data = get_text_columns(small_test_data, text_column)
small_test_data = small_test_data.iloc[0]
if task_type not in [
Expand Down Expand Up @@ -369,6 +372,9 @@ def _validate_rai_insights_input_parameters(
if not target_columns_set.issubset(set(test.columns)):
raise UserConfigValidationException(
'The list of target_column(s) should be in test data')
elif task_type==ModelTask.GENERATIVE_TEXT.value and target_column is None:
# target column is optional for generative text
pass
else:
if target_column not in list(test.columns):
raise UserConfigValidationException(
Expand Down Expand Up @@ -522,7 +528,10 @@ def _get_test_text_data(self, is_classification_task):
elif self.task_type == ModelTask.QUESTION_ANSWERING:
dataset = self.test.drop([self.target_column], axis=1)
elif self.task_type == ModelTask.GENERATIVE_TEXT:
dataset = self.test.drop([self.target_column], axis=1)
if self.target_column is None:
dataset = self.test.copy()
else:
dataset = self.test.drop([self.target_column], axis=1)
else:
raise ValueError("Unknown task type: {}".format(self.task_type))
dataset = get_text_columns(dataset, self._text_column)
Expand Down Expand Up @@ -578,13 +587,16 @@ def _get_dataset(self):

dashboard_dataset.features = self._ext_test

true_y = self.test[self.target_column]
if true_y is not None and len(true_y) == row_length:
true_y = convert_to_list(true_y)
if is_classification_task:
true_y = self._convert_labels(
true_y, dashboard_dataset.class_names)
dashboard_dataset.true_y = true_y
if self.target_column is None:
dashboard_dataset.true_y = None
else:
true_y = self.test[self.target_column]
if true_y is not None and len(true_y) == row_length:
true_y = convert_to_list(true_y)
if is_classification_task:
true_y = self._convert_labels(
true_y, dashboard_dataset.class_names)
dashboard_dataset.true_y = true_y

dashboard_dataset.feature_names = self._ext_features
dashboard_dataset.target_column = self.target_column
Expand Down
35 changes: 20 additions & 15 deletions responsibleai_text/responsibleai_text/utils/feature_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

from nlp_feature_extractors import attribute_extractors as exts
from responsibleai_text.common.constants import (ModelTask,
QuestionAnsweringFields)
QuestionAnsweringFields,
GenerativeTextFields)

nlp = None

Expand Down Expand Up @@ -62,27 +63,21 @@ def extract_features(text_dataset: pd.DataFrame,
feature_names.append("context_overlap")
elif task_type == ModelTask.GENERATIVE_TEXT:
# TODO: Add feature names for generative text
start_meta_index += 1
feature_names = []
prefixes = [QuestionAnsweringFields.CONTEXT + "_",
QuestionAnsweringFields.QUESTION + "_"]
for prefix in prefixes:
for feature_name in base_feature_names:
feature_names.append(prefix + feature_name)
feature_names.append(prefix + "average_parse_tree_depth")
feature_names.append(prefix + "maximum_parse_tree_depth")
feature_names.append("question_type")
feature_names.append("context_overlap")
feature_names = base_feature_names
else:
raise ValueError("Unknown task type: {}".format(task_type))
# copy over the metadata column names
for j in range(start_meta_index, text_dataset.shape[1]):
if has_dropped_features and column_names[j] in dropped_features:
continue
feature_names.append(column_names[j])
if not isinstance(target_column, list):

if not isinstance(target_column, (list, type(None))):
target_column = [target_column]
text_features = text_dataset.drop(target_column, axis=1)

text_features = text_dataset.copy()
if target_column is not None:
text_features = text_features.drop(target_column, axis=1)

if task_type in single_text_col_tasks:
sentences = text_features.iloc[:, 0].tolist()
Expand Down Expand Up @@ -116,7 +111,17 @@ def extract_features(text_dataset: pd.DataFrame,
results.append(extracted_features)
elif task_type == ModelTask.GENERATIVE_TEXT:
# TODO: Add feature extraction for generative text
pass
for i, row in tqdm(text_features.iterrows(), desc='feature extraction'):
extracted_features = []
add_extracted_features_for_sentence(
row[GenerativeTextFields.PROMPT], extracted_features,
task_type)

# append all other metadata features
append_metadata_values(start_meta_index, text_dataset, i,
extracted_features, has_dropped_features,
dropped_features, column_names)
results.append(extracted_features)
else:
raise ValueError("Unknown task type: {}".format(task_type))
return results, feature_names
Expand Down