-
Notifications
You must be signed in to change notification settings - Fork 162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
New Data Labeler: ColumnNameModel Build #626
Changes from 8 commits
8e07b27
ff39b3f
30ca099
2c6ae44
5e707ed
b47cdda
1cef721
be0fd7a
2455675
b6a3f28
69c565b
38a0456
df50a7b
55ed522
4f0225b
96305de
8cfefc5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,216 @@ | ||
"""Contains class for column name data labeling model.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a whole new model, based to some extent on regex_model.py. |
||
import copy | ||
import json | ||
from operator import neg | ||
import os | ||
import re | ||
import sys | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from rapidfuzz import process, fuzz | ||
|
||
from .. import dp_logging | ||
from .base_model import AutoSubRegistrationMeta, BaseModel | ||
|
||
logger = dp_logging.get_child_logger(__name__) | ||
|
||
|
||
class ColumnNameModel(BaseModel, metaclass=AutoSubRegistrationMeta): | ||
"""Class for column name data labeling model.""" | ||
|
||
def __init__(self, parameters=None): | ||
r""" | ||
:param parameters: Contains all the appropriate parameters for the model. | ||
Possible parameters are: | ||
max_length, max_num_chars, dim_embed | ||
:type parameters: dict | ||
:return: None | ||
""" | ||
# parameter initialization | ||
if not parameters: | ||
parameters = {} | ||
parameters.setdefault('false_positive_dict', None) | ||
parameters.setdefault('true_positive_dict', None) | ||
|
||
# initialize class | ||
self._validate_parameters(parameters) | ||
self._parameters = parameters | ||
|
||
def _validate_parameters(self, parameters): | ||
r""" | ||
Validate the parameters sent in. | ||
|
||
Raise error if invalid parameters are present. | ||
|
||
:param parameters: parameter dict containing the following parameters: | ||
true_positive_dict | ||
false_positive_dict | ||
:type parameters: dict | ||
:return: None | ||
""" | ||
errors = [] | ||
|
||
list_of_accepted_parameters = [ | ||
"true_positive_dict", | ||
"false_positive_dict", | ||
] | ||
|
||
for param in parameters: | ||
value = parameters[param] | ||
if param == "false_positive_dict" and value != None and ( | ||
not isinstance(value, list) | ||
or 'attribute' not in value[0].keys() | ||
): | ||
errors.append( | ||
"`{}` must be a list of dictionaries with at least 'attribute' as the key".format(param) | ||
) | ||
elif param == "true_positive_dict" and ( | ||
not isinstance(value, list) | ||
or not isinstance(value[0], dict) | ||
): | ||
errors.append( | ||
"""`{}` must be a list of dictionaries each with the following | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't check for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixing -- good catch |
||
two keys: 'attribute' and 'label'""".format(param) | ||
) | ||
elif param not in list_of_accepted_parameters: | ||
errors.append("`{}` is not an accepted parameter.".format(param)) | ||
if errors: | ||
raise ValueError("\n".join(errors)) | ||
|
||
@staticmethod | ||
def _make_lower_case(str, **kwargs): | ||
return str.lower() | ||
|
||
def _compare_negative(self, list_of_column_names, check_values_dict,
                      negative_threshold):
    """
    Filter out column name examples that are false positives.

    A column name is kept only when its best similarity score against the
    false positive examples is strictly below `negative_threshold`.

    :param list_of_column_names: column names to filter
    :param check_values_dict: false positive examples to compare against
    :param negative_threshold: score at or above which a name is dropped
    :return: the column names that survived the filter, in input order
    :rtype: list
    """
    similarity = self._model(
        list_of_column_names,
        check_values_dict,
        self._make_lower_case,
        fuzz.token_sort_ratio,
    )

    return [
        column_name
        for column_name, result in zip(list_of_column_names, similarity)
        if result[0] < negative_threshold
    ]
|
||
def _compare_positive(self, list_of_column_names, check_values_dict,
                      positive_threshold, include_label, show_confidences):
    """
    Calculate similarity scores between column names and true positives.

    Only column names whose best score is strictly greater than
    `positive_threshold` appear in the result.

    :param list_of_column_names: column names to label
    :param check_values_dict: true positive examples; the 'label' of the
        best-matching example becomes the prediction
    :param positive_threshold: score a name must exceed to be labeled
    :param include_label: forwarded to `_model`; the best-match index it
        adds is what the 'label' lookup below relies on
    :param show_confidences: whether to also report the best score
    :return: mapping of column name to {'pred': label} plus 'conf' when
        `show_confidences` is set
    :rtype: dict
    """
    similarity = self._model(
        list_of_column_names,
        check_values_dict,
        self._make_lower_case,
        fuzz.token_sort_ratio,
        include_label=include_label,
    )

    predictions = {}
    for column_name, result in zip(list_of_column_names, similarity):
        best_score = result[0]
        if not best_score > positive_threshold:
            continue
        entry = {'pred': check_values_dict[result[1]]['label']}
        if show_confidences:
            entry['conf'] = best_score
        predictions[column_name] = entry

    return predictions
|
||
def _construct_model(self): | ||
pass | ||
|
||
def _reconstruct_model(self): | ||
pass | ||
|
||
def _need_to_reconstruct_model(self): | ||
pass | ||
|
||
def reset_weights(self):
    """Do nothing and return None; this method is a no-op here."""
    return None
|
||
def _model(self, list_of_column_names, check_values_dict, processor, scorer,
           include_label=False):
    """
    Score every column name against every reference attribute.

    :param list_of_column_names: column names to score
    :param check_values_dict: reference entries; the 'attribute' value of
        each entry is what the column names are compared against
    :type check_values_dict: list of dict
    :param processor: forwarded to `process.cdist` as `processor`
    :param scorer: forwarded to `process.cdist` as `scorer`
    :param include_label: whether to also report the position of the
        best-matching entry within `check_values_dict`
    :type include_label: bool
    :return: one list per column name, `[best_score]` or
        `[best_score, index_of_best_match]` when `include_label` is set
    :rtype: list
    """
    check_values_list = [entry['attribute'] for entry in check_values_dict]

    # Rows correspond to column names, columns to reference attributes.
    model_outputs = process.cdist(list_of_column_names,
                                  check_values_list,
                                  processor=processor,
                                  scorer=scorer)

    scores = []
    for match_results in model_outputs:
        column_result = [np.max(match_results)]
        if include_label:
            column_result.append(match_results.argmax(axis=0))
        scores.append(column_result)

    return scores
|
||
def predict(self, data, batch_size=None, show_confidences=False, verbose=True,
            include_label=True):
    """
    Label column names by similarity to the configured examples.

    When false positive examples are configured, names scoring 50 or more
    against them are dropped first. The remaining names are labeled when
    they score above 85 against a true positive example.

    :param data: list of strings to predict upon
    :type data: iterator
    :param batch_size: does not impact this model and should be fixed to not
        be required.
    :type batch_size: N/A
    :param show_confidences: whether user wants prediction confidences
    :type show_confidences: bool
    :param verbose: Flag to determine whether to log status or not
    :type verbose: bool
    :param include_label: accepted but currently not forwarded; the label
        lookup is always performed
    :type include_label: bool
    :return: mapping of each labeled column name to a dict holding 'pred'
        and, when `show_confidences` is set, 'conf'
    :rtype: dict
    """
    negative_examples = self._parameters['false_positive_dict']
    if negative_examples:
        data = self._compare_negative(
            data, negative_examples, negative_threshold=50
        )
        if verbose:
            logger.info("compare_negative process complete")

    positive_examples = self._parameters['true_positive_dict']
    predictions = self._compare_positive(
        data,
        positive_examples,
        positive_threshold=85,
        include_label=True,
        show_confidences=show_confidences,
    )
    if verbose:
        logger.info("compare_positive process complete")

    return predictions
|
||
@classmethod | ||
def load_from_disk(cls, dirpath): | ||
""" | ||
Load whole model from disk with weights. | ||
|
||
:param dirpath: directory path where you want to load the model from | ||
:type dirpath: str | ||
:return: None | ||
""" | ||
# load parameters | ||
model_param_dirpath = os.path.join(dirpath, "model_parameters.json") | ||
with open(model_param_dirpath, "r") as fp: | ||
parameters = json.load(fp) | ||
|
||
loaded_model = cls(parameters) | ||
return loaded_model | ||
|
||
def save_to_disk(self, dirpath): | ||
""" | ||
Save whole model to disk with weights. | ||
|
||
:param dirpath: directory path where you want to save the model to | ||
:type dirpath: str | ||
:return: None | ||
""" | ||
if not os.path.isdir(dirpath): | ||
os.makedirs(dirpath) | ||
|
||
model_param_dirpath = os.path.join(dirpath, "model_parameters.json") | ||
with open(model_param_dirpath, "w") as fp: | ||
json.dump(self._parameters, fp) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
adding the model to be loadable as a class