Skip to content

Commit

Permalink
remove target_column requirement for genai tasks (#2501)
Browse files: browse the repository at this point in the history
Signed-off-by: Kartik Choudhary <[email protected]>
  • Loading branch information
kartik727 authored and imatiach-msft committed Jan 23, 2024
1 parent 99cc0fe commit 33dc0f6
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 35 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -253,8 +253,11 @@ def __init__(self, model: Any, dataset: pd.DataFrame, target_column: str,
for evaluating the model.
:type dropped_features: Optional[List[str]]
"""
self._true_y = dataset[target_column]
self._dataset = dataset.drop(columns=[target_column])
self._true_y = None if target_column is None else dataset[target_column]
if self._true_y is None:
self._dataset = dataset.copy()
else:
self._dataset = dataset.drop(columns=[target_column])
self._feature_names = list(self._dataset.columns)
self._model_task = model_task
self._classes = classes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ def _create_index_predictor(model, dataset, target_column,
:return: A wrapped predictor that uses index to retrieve text data.
:rtype: WrappedIndexPredictorModel
"""
dataset = dataset.drop(columns=[target_column])
if target_column is not None:
dataset = dataset.drop(columns=[target_column])
dataset = get_text_columns(dataset, text_column)
index_predictor = WrappedIndexPredictorModel(
model, dataset, is_multilabel, task_type, classes)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,13 @@ def __init__(self, model: Any, evaluation_examples: pd.DataFrame,
"""
self._model = model
self._target_column = target_column
if not isinstance(target_column, list):
if not isinstance(target_column, (list, type(None))):
target_column = [target_column]
self._evaluation_examples = \
evaluation_examples.drop(columns=target_column)
if target_column is None:
self._evaluation_examples = evaluation_examples
else:
self._evaluation_examples = \
evaluation_examples.drop(columns=target_column)
self._is_run = False
self._is_added = False
self._features = list(self._evaluation_examples.columns)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,8 @@ def __init__(self, model: Any, test: pd.DataFrame,
self._ext_test = ext_test
self._ext_features = ext_features
self._ext_test_df = pd.DataFrame(ext_test, columns=ext_features)
self._ext_test_df[target_column] = test[target_column]
if target_column is not None:
self._ext_test_df[target_column] = test[target_column]
self.predict_output = None

super(RAITextInsights, self).__init__(
Expand Down Expand Up @@ -266,11 +267,13 @@ def _validate_model(self, model: Any, test: pd.DataFrame,
an exception will be raised.
:type text_column: str or list[str]
"""
if not isinstance(target_column, list):
target_column = [target_column]
# Pick one row from test data
small_test_data = test.iloc[0:1].drop(
target_column, axis=1)
small_test_data = test.iloc[0:1]
if target_column is not None:
if not isinstance(target_column, list):
target_column = [target_column]
# Pick one row from test data
small_test_data = small_test_data.drop(
target_column, axis=1)
small_test_data = get_text_columns(small_test_data, text_column)
small_test_data = small_test_data.iloc[0]
if task_type not in [
Expand Down Expand Up @@ -369,6 +372,9 @@ def _validate_rai_insights_input_parameters(
if not target_columns_set.issubset(set(test.columns)):
raise UserConfigValidationException(
'The list of target_column(s) should be in test data')
elif task_type==ModelTask.GENERATIVE_TEXT.value and target_column is None:
# target column is optional for generative text
pass
else:
if target_column not in list(test.columns):
raise UserConfigValidationException(
Expand Down Expand Up @@ -522,7 +528,10 @@ def _get_test_text_data(self, is_classification_task):
elif self.task_type == ModelTask.QUESTION_ANSWERING:
dataset = self.test.drop([self.target_column], axis=1)
elif self.task_type == ModelTask.GENERATIVE_TEXT:
dataset = self.test.drop([self.target_column], axis=1)
if self.target_column is None:
dataset = self.test.copy()
else:
dataset = self.test.drop([self.target_column], axis=1)
else:
raise ValueError("Unknown task type: {}".format(self.task_type))
dataset = get_text_columns(dataset, self._text_column)
Expand Down Expand Up @@ -578,13 +587,16 @@ def _get_dataset(self):

dashboard_dataset.features = self._ext_test

true_y = self.test[self.target_column]
if true_y is not None and len(true_y) == row_length:
true_y = convert_to_list(true_y)
if is_classification_task:
true_y = self._convert_labels(
true_y, dashboard_dataset.class_names)
dashboard_dataset.true_y = true_y
if self.target_column is None:
dashboard_dataset.true_y = None
else:
true_y = self.test[self.target_column]
if true_y is not None and len(true_y) == row_length:
true_y = convert_to_list(true_y)
if is_classification_task:
true_y = self._convert_labels(
true_y, dashboard_dataset.class_names)
dashboard_dataset.true_y = true_y

dashboard_dataset.feature_names = self._ext_features
dashboard_dataset.target_column = self.target_column
Expand Down
35 changes: 20 additions & 15 deletions responsibleai_text/responsibleai_text/utils/feature_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

from nlp_feature_extractors import attribute_extractors as exts
from responsibleai_text.common.constants import (ModelTask,
QuestionAnsweringFields)
QuestionAnsweringFields,
GenerativeTextFields)

nlp = None

Expand Down Expand Up @@ -62,27 +63,21 @@ def extract_features(text_dataset: pd.DataFrame,
feature_names.append("context_overlap")
elif task_type == ModelTask.GENERATIVE_TEXT:
# TODO: Add feature names for generative text
start_meta_index += 1
feature_names = []
prefixes = [QuestionAnsweringFields.CONTEXT + "_",
QuestionAnsweringFields.QUESTION + "_"]
for prefix in prefixes:
for feature_name in base_feature_names:
feature_names.append(prefix + feature_name)
feature_names.append(prefix + "average_parse_tree_depth")
feature_names.append(prefix + "maximum_parse_tree_depth")
feature_names.append("question_type")
feature_names.append("context_overlap")
feature_names = base_feature_names
else:
raise ValueError("Unknown task type: {}".format(task_type))
# copy over the metadata column names
for j in range(start_meta_index, text_dataset.shape[1]):
if has_dropped_features and column_names[j] in dropped_features:
continue
feature_names.append(column_names[j])
if not isinstance(target_column, list):

if not isinstance(target_column, (list, type(None))):
target_column = [target_column]
text_features = text_dataset.drop(target_column, axis=1)

text_features = text_dataset.copy()
if target_column is not None:
text_features = text_features.drop(target_column, axis=1)

if task_type in single_text_col_tasks:
sentences = text_features.iloc[:, 0].tolist()
Expand Down Expand Up @@ -116,7 +111,17 @@ def extract_features(text_dataset: pd.DataFrame,
results.append(extracted_features)
elif task_type == ModelTask.GENERATIVE_TEXT:
# TODO: Add feature extraction for generative text
pass
for i, row in tqdm(text_features.iterrows(), desc='feature extraction'):
extracted_features = []
add_extracted_features_for_sentence(
row[GenerativeTextFields.PROMPT], extracted_features,
task_type)

# append all other metadata features
append_metadata_values(start_meta_index, text_dataset, i,
extracted_features, has_dropped_features,
dropped_features, column_names)
results.append(extracted_features)
else:
raise ValueError("Unknown task type: {}".format(task_type))
return results, feature_names
Expand Down

0 comments on commit 33dc0f6

Please sign in to comment.