From 832c9572b01fbe2911b4aea055f0010bd0382998 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 31 Mar 2022 16:31:47 -0500 Subject: [PATCH 01/14] feat: new model and pep8 --- .../labelers/character_level_cnn_model.py | 21 +- .../labelers/pre_encoded_char_cnn_model.py | 839 ++++++++++++++++++ 2 files changed, 858 insertions(+), 2 deletions(-) create mode 100644 dataprofiler/labelers/pre_encoded_char_cnn_model.py diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 8a9f164ba..4a9e84311 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -5,6 +5,7 @@ import sys import time from collections import defaultdict +import functools import numpy as np import tensorflow as tf @@ -34,7 +35,23 @@ def filter(self, record): tf_logger.addFilter(NoV1ResourceMessageFilter()) -@tf.keras.utils.register_keras_serializable() +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. + """ + def decorator(arg): + """Protects against double registration of a keras layer.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + '>' + class_name + if tf.keras.utils.get_registered_object(registered_name) is None: + tf.keras.utils.register_keras_serializable(package, name)(arg) + return arg + return decorator + + +@protected_register_keras_serializable() class FBetaScore(tf.keras.metrics.Metric): r"""Computes F-Beta score. Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 @@ -198,7 +215,7 @@ def reset_states(self): tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) -@tf.keras.utils.register_keras_serializable() +@protected_register_keras_serializable() class F1Score(FBetaScore): r"""Computes F-1 Score. diff --git a/dataprofiler/labelers/pre_encoded_char_cnn_model.py b/dataprofiler/labelers/pre_encoded_char_cnn_model.py new file mode 100644 index 000000000..a8ecb96de --- /dev/null +++ b/dataprofiler/labelers/pre_encoded_char_cnn_model.py @@ -0,0 +1,839 @@ +import json +import copy +import os +import sys +import time +import logging +from collections import defaultdict +import functools + +import tensorflow as tf +import numpy as np +from sklearn import decomposition + +from . import labeler_utils +from .base_model import BaseModel, BaseTrainableModel +from .base_model import AutoSubRegistrationMeta +from .. import dp_logging + +_file_dir = os.path.dirname(os.path.abspath(__file__)) + +logger = dp_logging.get_child_logger(__name__) + + +class NoV1ResourceMessageFilter(logging.Filter): + """Removes TF2 warning for using TF1 model which has resources.""" + def filter(self, record): + msg = 'is a problem, consider rebuilding the SavedModel after ' + \ + 'running tf.compat.v1.enable_resource_variables()' + return msg not in record.getMessage() + + +tf_logger = logging.getLogger('tensorflow') +tf_logger.addFilter(NoV1ResourceMessageFilter()) + + +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. 
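+
+    A minimal illustrative sketch (``MyMetric`` is a hypothetical class)::
+
+        @protected_register_keras_serializable(package='Custom')
+        class MyMetric(tf.keras.metrics.Metric):
+            ...
+
+    If ``Custom>MyMetric`` is already registered, the decorator simply
+    returns the class without attempting to register it again.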
+    """
+    def decorator(arg):
+        """Protects against double registration of a keras layer."""
+        class_name = name if name is not None else arg.__name__
+        registered_name = package + '>' + class_name
+        if tf.keras.utils.get_registered_object(registered_name) is None:
+            tf.keras.utils.register_keras_serializable(package, name)(arg)
+        return arg
+    return decorator
+
+
+@protected_register_keras_serializable()
+class FBetaScore(tf.keras.metrics.Metric):
+    r"""Computes F-Beta score.
+    Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283
+
+    # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    # http://www.apache.org/licenses/LICENSE-2.0
+    # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ==============================================================================
+
+    It is the weighted harmonic mean of precision
+    and recall. Output range is `[0, 1]`. Works for
+    both multi-class and multi-label classification.
+    $$
+    F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{recall}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}}
+    $$
+    Args:
+        num_classes: Number of unique classes in the dataset.
+        average: Type of averaging to be performed on data.
+            Acceptable values are `None`, `micro`, `macro` and
+            `weighted`. Default value is None.
+        beta: Determines the relative weight given to recall versus
+            precision in the harmonic mean. Default value is 1.
+        threshold: Elements of `y_pred` greater than threshold are
+            converted to be 1, and the rest 0. If threshold is
+            None, the argmax is converted to 1, and the rest 0.
+        name: (Optional) String name of the metric instance.
+        dtype: (Optional) Data type of the metric result.
+    Returns:
+        F-Beta Score: float.
+    """
+
+    # Modification: remove the run-time type checking for functions
+    def __init__(self, num_classes, average=None, beta=1.0, threshold=None,
+                 name="fbeta_score", dtype=None, **kwargs):
+        super().__init__(name=name, dtype=dtype)
+
+        if average not in (None, "micro", "macro", "weighted"):
+            raise ValueError(
+                "Unknown average type. 
Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name): + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state(self, y_true, y_pred, sample_weight=None): + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_states(self): + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) + + +@protected_register_keras_serializable() +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    # http://www.apache.org/licenses/LICENSE-2.0
+    # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ==============================================================================
+
+    It is the harmonic mean of precision and recall.
+    Output range is `[0, 1]`. Works for both multi-class
+    and multi-label classification.
+    $$
+    F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}}
+    $$
+    Args:
+        num_classes: Number of unique classes in the dataset.
+        average: Type of averaging to be performed on data.
+            Acceptable values are `None`, `micro`, `macro`
+            and `weighted`. Default value is None.
+        threshold: Elements of `y_pred` above threshold are
+            considered to be 1, and the rest 0. If threshold is
+            None, the argmax is converted to 1, and the rest 0.
+        name: (Optional) String name of the metric instance.
+        dtype: (Optional) Data type of the metric result.
+    Returns:
+        F-1 Score: float.
+    """
+
+    # Modification: remove the run-time type checking for functions
+    def __init__(self, num_classes, average=None, threshold=None,
+                 name="f1_score", dtype=None):
+        super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype)
+
+    def get_config(self):
+        base_config = super().get_config()
+        del base_config["beta"]
+        return base_config
+
+
+class PreEncodedCharCnnModel(BaseTrainableModel,
+                             metaclass=AutoSubRegistrationMeta):
+
+    # boolean if the label mapping requires the mapping for index 0 reserved
+    requires_zero_mapping = True
+
+    def __init__(self, label_mapping=None, parameters=None):
+        """
+        Initialize the PreEncodedCharCnnModel; also sets the epoch id to zero.
+
+        :param label_mapping: maps labels to their encoded integers
+        :type label_mapping: dict
+        :param parameters: Contains all the appropriate parameters for the
+            model. Possible parameters are: max_length, alphabet_size,
+            dim_embed, conv_layers, size_fc, dropout, threshold,
+            default_label
+        :type parameters: dict
+        :return: None
+        """
+
+        # parameter initialization
+        if not parameters:
+            parameters = {}
+        parameters.setdefault('max_length', 1014)
+        parameters.setdefault('alphabet_size', 69)
+        parameters.setdefault('dim_embed', 32)
+        parameters.setdefault('conv_layers', [
+            [256, 7, 1],
+            [256, 7, 1],
+            [256, 3, -1],
+            [256, 3, -1],
+            [256, 3, -1],
+            [256, 3, 1]
+        ])
+        parameters.setdefault('size_fc', [512, 512])
+        parameters.setdefault('dropout', 0.5)
+        parameters.setdefault('threshold', 1e-6)
+        parameters.setdefault('default_label', "UNKNOWN")
+        parameters['pad_label'] = 'PAD'
+        self._epoch_id = 0
+
+        # reconstruct flags for model
+        self._model_num_labels = 0
+        self._model_default_ind = -1
+
+        BaseModel.__init__(self, label_mapping, parameters)
+
+    def __eq__(self, other):
+        """
+        Check whether two models are equal; only key variables may be
+        compared, i.e. the underlying TF model itself may not be checked.
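+
+        A hypothetical comparison sketch::
+
+            model_a == model_b  # True when parameters and label mappings match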
+
+        :param self: a model
+        :param other: a model
+        :type self: BaseModel
+        :type other: BaseModel
+        :return: Whether or not self and other are equal
+        :rtype: bool
+        """
+        if self._parameters != other._parameters \
+                or self._label_mapping != other._label_mapping:
+            return False
+        return True
+
+    def _validate_parameters(self, parameters):
+        """
+        Validate the parameters sent in. Raise error if invalid parameters are
+        present.
+
+        :param parameters: parameter dict containing the following parameters:
+            max_length: Maximum char length in a sample
+            alphabet_size: Size of the character alphabet used by the
+                pre-encoded input (the embedding has alphabet_size + 1 rows)
+            dim_embed: Number of embedded dimensions
+            conv_layers: List of [num_filters, kernel_size, pool_size]
+                entries, one per convolution layer (a pool_size of -1 skips
+                pooling for that layer)
+            size_fc: Sizes of each fully connected layer
+            dropout: Ratio of dropout in the model
+            threshold: Threshold used by the ThresholdedReLU activations
+            default_label: Key for label_mapping that is the default label
+            pad_label: Key for entities_dict that is the pad label
+        :type parameters: dict
+        :return: None
+        """
+        errors = []
+        list_of_necessary_params = ['max_length', 'alphabet_size',
+                                    'dim_embed', 'size_fc', 'dropout',
+                                    'threshold', 'conv_layers', 'default_label',
+                                    'pad_label']
+        # Make sure the necessary parameters are present and valid.
+        for param in parameters:
+            if param in ['max_length', 'alphabet_size', 'dim_embed']:
+                if not isinstance(parameters[param], (int, float)) \
+                        or parameters[param] < 0:
+                    errors.append(param + " must be a valid integer or float "
+                                          "greater than 0.")
+            elif param in ['dropout', 'threshold']:
+                if not isinstance(parameters[param], (int, float)) \
+                        or parameters[param] < 0 or parameters[param] > 1:
+                    errors.append(param + " must be a valid integer or float "
+                                          "from 0 to 1.")
+            elif param == 'size_fc':
+                if not isinstance(parameters[param], list) \
+                        or len(parameters[param]) == 0:
+                    errors.append(param + " must be a non-empty list of "
+                                          "integers.")
+                else:
+                    for item in parameters[param]:
+                        if not isinstance(item, int):
+                            errors.append(param + " must be a non-empty "
+                                          "list of integers.")
+                            break
+            elif param == 'conv_layers':
+                is_bad_conv_layers = True
+                if isinstance(parameters[param], list):
+                    is_bad_conv_layers = False
+                    for layer in parameters[param]:
+                        if (not isinstance(layer, list) or len(layer) != 3
+                                or any([not isinstance(x, int) for x in layer])):
+                            is_bad_conv_layers = True
+                if is_bad_conv_layers:
+                    errors.append(param + " must be a non-empty list of "
+                                          "lists, each containing 3 integers.")
+            elif param == 'default_label':
+                if not isinstance(parameters[param], str):
+                    error = str(param) + " must be a string."
+ errors.append(error) + + # Error if there are extra parameters thrown in + for param in parameters: + if param not in list_of_necessary_params: + errors.append(param + " is not an accepted parameter.") + if errors: + raise ValueError('\n'.join(errors)) + + def set_label_mapping(self, label_mapping): + """ + Sets the labels for the model + + :param label_mapping: label mapping of the model + :type label_mapping: dict + :return: None + """ + if not isinstance(label_mapping, (list, dict)): + raise TypeError("Labels must either be a non-empty encoding dict " + "which maps labels to index encodings or a list.") + + label_mapping = copy.deepcopy(label_mapping) + if 'PAD' not in label_mapping: + if isinstance(label_mapping, list): # if list missing PAD + label_mapping = ['PAD'] + label_mapping + elif 0 not in label_mapping.values(): # if dict missing PAD and 0 + label_mapping.update({'PAD': 0}) + if (isinstance(label_mapping, dict) + and label_mapping.get('PAD', None) != 0): # dict with bad PAD + raise ValueError("`PAD` must map to index zero.") + if self._parameters['default_label'] not in label_mapping: + raise ValueError("The `default_label` of {} must exist in the " + "label mapping.".format( + self._parameters['default_label'])) + super().set_label_mapping(label_mapping) + + def _need_to_reconstruct_model(self): + """ + Determines whether or not the model needs to be reconstructed. + + :return: bool of whether or not the model needs to reconstruct. + """ + if not self._model: + return False + default_ind = self.label_mapping[self._parameters['default_label']] + return self.num_labels != self._model_num_labels or \ + default_ind != self._model_default_ind + + def save_to_disk(self, dirpath): + """ + Saves whole model to disk with weights + + :param dirpath: directory path where you want to save the model to + :type dirpath: str + :return: None + """ + if not self._model: + self._construct_model() + elif self._need_to_reconstruct_model(): + self._reconstruct_model() + + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'w') as fp: + json.dump(self._parameters, fp) + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'w') as fp: + json.dump(self.label_mapping, fp) + self._model.save(os.path.join(dirpath)) + + @classmethod + def load_from_disk(cls, dirpath): + """ + Loads whole model from disk with weights + + :param dirpath: directory path where you want to load the model from + :type dirpath: str + :return: None + """ + + # load parameters + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'r') as fp: + parameters = json.load(fp) + + # load label_mapping + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'r') as fp: + label_mapping = json.load(fp) + + # use f1 score metric + custom_objects = { + "F1Score": F1Score( + num_classes=max(label_mapping.values()) + 1, + average='micro'), + "CharacterLevelCnnModel": cls, + } + with tf.keras.utils.custom_object_scope(custom_objects): + tf_model = tf.keras.models.load_model(dirpath) + + loaded_model = cls(label_mapping, parameters) + loaded_model._model = tf_model + # + # # Tensorflow v1 Model weights need to be transferred. 
+ # if not callable(tf_model): + # loaded_model._construct_model() + # tf1_weights = [] + # for var in tf_model.variables: + # if 'training' not in var.name: + # tf1_weights.append(var.value()) + # + # loaded_model._construct_model() + # tf1_weights.append(loaded_model._model.weights[-1].value()) + # loaded_model._model.set_weights(tf1_weights) + + # load self + loaded_model._model_num_labels = loaded_model.num_labels + loaded_model._model_default_ind = loaded_model.label_mapping[ + loaded_model._parameters['default_label'] + ] + return loaded_model + + def _construct_model(self): + """ + Model constructor for the data labeler. This also serves as a weight + reset. + + :return: None + """ + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + + # default parameters + max_length = self._parameters['max_length'] + alphabet_size = self._parameters['alphabet_size'] + dim_embed = self._parameters['dim_embed'] + conv_layers = self._parameters['conv_layers'] + size_fc = self._parameters['size_fc'] + threshold = self._parameters['threshold'] + dropout = self._parameters['dropout'] + + # Reset model + tf.keras.backend.clear_session() + + # Input layer + inputs = tf.keras.layers.Input( + shape=(None,), name='sent_input', dtype='int64') + # Embedding layers + x_embedding = tf.keras.layers.Embedding( + alphabet_size + 1, dim_embed, input_length=max_length)(inputs) + + # Convolution layers + x = x_embedding + for cl in conv_layers: + x = tf.keras.layers.Convolution1D(cl[0], cl[1], padding='same')(x) + x = tf.keras.layers.ThresholdedReLU(threshold)(x) + if cl[2] != -1: + x = tf.keras.layers.MaxPooling1D(cl[2])(x) + # x = tf.keras.layers.Flatten()(x) + + # Fully connected layers + for fl in size_fc: + x_dense = tf.keras.layers.Dense(fl)(x) + x = tf.keras.layers.ThresholdedReLU(threshold)(x_dense) + x = tf.keras.layers.Dropout(dropout)(x) + + # Output layer + predictions = tf.keras.layers.Dense( + num_labels, activation='softmax', name='softmax_output')(x) + # argmax layer + argmax_layer = tf.keras.backend.argmax(predictions) + + # Build and compile model + self._model = tf.keras.models.Model( + inputs=inputs, outputs=[predictions, argmax_layer]) + + # Compile the model w/ metrics + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def reset_weights(self): + """ + Reset the weights of the model. 
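+        Since this simply calls ``_construct_model``, the network is rebuilt
+        and recompiled from scratch.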
+ + :return: None + """ + self._construct_model() + + def _reconstruct_model(self): + """ + Reconstruct the appropriate layers if the number of number of labels is + altered + + :return: None + """ + + # Reset model + tf.keras.backend.clear_session() + + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + + # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') + for _ in range(2): + self._model.layers.pop() + + # Add the final Softmax layer to the previous spot + final_softmax_layer = tf.keras.layers.Dense( + num_labels, activation='softmax', name="softmax_output")( + self._model.layers[-4].output) + + # Output the model into a .pb file for TensorFlow + argmax_layer = tf.keras.backend.argmax(final_softmax_layer) + + + argmax_outputs = [final_softmax_layer, argmax_layer] + self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + + # Compile the model + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, + reset_weights=False, verbose=True): + """ + Train the current model with the training data and validation data + + :param train_data: Training data used to train model + :type train_data: Union[list, np.ndarray] + :param val_data: Validation data used to validate the training + :type val_data: Union[list, np.ndarray] + :param batch_size: Used to determine number of samples in each batch + :type batch_size: int + :param label_mapping: maps labels to their encoded integers + :type label_mapping: Union[dict, None] + :param reset_weights: Flag to determine whether to reset the weights or + not + :type reset_weights: bool + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: None + """ + + if label_mapping is not None: + self.set_label_mapping(label_mapping) + + if not self._model: + self._construct_model() + else: + if self._need_to_reconstruct_model(): + self._reconstruct_model() + if reset_weights: + self.reset_weights() + + history = defaultdict() + f1 = None + f1_report = [] + + self._model.reset_metrics() + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + + start_time = time.time() + batch_id = 0 + for x_train, y_train in train_data: + model_results = self._model.train_on_batch( + x_train, {softmax_output_layer_name: y_train}) + sys.stdout.flush() + if verbose: + sys.stdout.write( + "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " + "f1_score %f" % + (self._epoch_id, batch_id, *model_results[1:])) + batch_id += 1 + + for i, metric_label in enumerate(self._model.metrics_names): + history[metric_label] = model_results[i] + + if val_data: + f1, f1_report = self._validate_training(val_data) + history['f1_report'] = f1_report + + val_f1 = f1_report['weighted avg']['f1-score'] \ + if f1_report else np.NAN + val_precision = f1_report['weighted avg']['precision'] \ + if f1_report else np.NAN + val_recall = f1_report['weighted avg']['recall'] \ + if f1_report else np.NAN + epoch_time = time.time() - start_time + logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - 
f1_score %f -- " + "val_f1: %f - val_precision: %f - val_recall %f" % + (self._epoch_id, epoch_time, *model_results[1:], + val_f1, val_precision, val_recall)) + + self._epoch_id += 1 + + return history, f1, f1_report + + def _validate_training(self, val_data, batch_size_test=32, + verbose_log=True, verbose_keras=False): + """ + Validate the model on the test set and return the evaluation metrics. + + :param val_data: data generator for the validation + :type val_data: iterator + :param batch_size_test: Number of samples to process in testing + :type batch_size_test: int + :param verbose_log: whether or not to print out scores for training, + etc. + :type verbose_log: bool + :param verbose_keras: whether or not to print out scores for training, + from keras. + :type verbose_keras: bool + return (f1-score, f1 report). + """ + f1 = None + f1_report = None + + if val_data is None: + return f1, f1_report + + # Predict on the test set + batch_id = 0 + y_val_pred = [] + y_val_test = [] + for x_val, y_val in val_data: + y_val_pred.append(self._model.predict( + x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) + y_val_test.append(np.argmax(y_val, axis=-1)) + batch_id += 1 + sys.stdout.flush() + if verbose_log: + sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % + (self._epoch_id, batch_id)) + + tf.keras.backend.set_floatx('float32') + # Clean the predicted entities and the actual entities + f1, f1_report = labeler_utils.evaluate_accuracy( + np.concatenate(y_val_pred, axis=0), + np.concatenate(y_val_test, axis=0), + self.num_labels, + self.reverse_label_mapping, + verbose=verbose_keras) + + return f1, f1_report + + def predict(self, data, batch_size=32, show_confidences=False, + verbose=True): + """ + Run model and get predictions + + :param data: text input + :type data: Union[list, numpy.ndarray] + :param batch_size: number of samples in the batch of data + :type batch_size: int + :param show_confidences: whether user wants prediction confidences + :type show_confidences: + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: char level predictions and confidences + :rtype: dict + """ + if not self._model: + raise ValueError("You are trying to predict without a model. " + "Construct/Load a model before predicting.") + elif self._need_to_reconstruct_model(): + raise RuntimeError("The model label mapping definitions have been " + "altered without additional training. 
Please " + "train the model or reset the label mapping to " + "predict.") + # Pre-allocate space for predictions + confidences = [] + sentence_lengths = np.zeros((batch_size,), dtype=int) + predictions = np.zeros((batch_size, self._parameters['max_length'])) + if show_confidences: + confidences = np.zeros((batch_size, + self._parameters['max_length'], + self.num_labels)) + + # Run model with batching + allocation_index = 0 + for batch_id, batch_data in enumerate(data): + model_output = self._model( + tf.convert_to_tensor(batch_data) + ) + + # Count number of samples in batch to prevent array mismatch + num_samples_in_batch = len(batch_data) + allocation_index = batch_id * batch_size + + # Double array size + if len(predictions) <= allocation_index: + predictions = np.pad(predictions, ((0, len(predictions)), + (0, 0)), mode='constant') + sentence_lengths = np.pad( + sentence_lengths, pad_width=((0, len(sentence_lengths)),), + mode='constant') + if show_confidences: + confidences = np.pad(confidences, + ((0, len(predictions)), + (0, 0), (0, 0)), mode='constant') + + if show_confidences: + confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() + predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() + sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) + + allocation_index += num_samples_in_batch + + # Convert predictions, confidences to lists from numpy + predictions_list = [i for i in range(0, allocation_index)] + confidences_list = None + if show_confidences: + confidences_list = [i for i in range(0, allocation_index)] + + # Append slices of predictions to return prediction & confidence matrices + for index, sentence_length \ + in enumerate(sentence_lengths[:allocation_index]): + predictions_list[index] = list(predictions[index][:sentence_length]) + if show_confidences: + confidences_list[index] = list(confidences[index][:sentence_length]) + + if show_confidences: + return {'pred': predictions_list, 'conf': confidences_list} + return {'pred': predictions_list} + + def details(self): + """ + Prints the relevant details of the model (summary, parameters, label + mapping) + """ + print("\n###### Model Details ######\n") + self._model.summary() + print("\nModel Parameters:") + for key, value in self._parameters.items(): + print("{}: {}".format(key, value)) + print("\nModel Label Mapping:") + for key, value in self.label_mapping.items(): + print("{}: {}".format(key, value)) From 391797f966e5e31bbb98dcb6fc7d9bacfd988bf0 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 5 Apr 2022 16:48:02 -0500 Subject: [PATCH 02/14] feat: load any tf model --- .../labelers/char_load_tf_trainable.py | 762 ++++++++++++++++++ 1 file changed, 762 insertions(+) create mode 100644 dataprofiler/labelers/char_load_tf_trainable.py diff --git a/dataprofiler/labelers/char_load_tf_trainable.py b/dataprofiler/labelers/char_load_tf_trainable.py new file mode 100644 index 000000000..492b4324f --- /dev/null +++ b/dataprofiler/labelers/char_load_tf_trainable.py @@ -0,0 +1,762 @@ +import json +import copy +import os +import sys +import time +import logging +from collections import defaultdict +import functools + +import tensorflow as tf +import numpy as np +from sklearn import decomposition + +from . import labeler_utils +from .base_model import BaseModel, BaseTrainableModel +from .base_model import AutoSubRegistrationMeta +from .. 
import dp_logging + +_file_dir = os.path.dirname(os.path.abspath(__file__)) + +logger = dp_logging.get_child_logger(__name__) + + +class NoV1ResourceMessageFilter(logging.Filter): + """Removes TF2 warning for using TF1 model which has resources.""" + def filter(self, record): + msg = 'is a problem, consider rebuilding the SavedModel after ' + \ + 'running tf.compat.v1.enable_resource_variables()' + return msg not in record.getMessage() + + +tf_logger = logging.getLogger('tensorflow') +tf_logger.addFilter(NoV1ResourceMessageFilter()) + + +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. + """ + def decorator(arg): + """Protects against double registration of a keras layer.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + '>' + class_name + if tf.keras.utils.get_registered_object(registered_name) is None: + tf.keras.utils.register_keras_serializable(package, name)(arg) + return arg + return decorator + + +@protected_register_keras_serializable() +class FBetaScore(tf.keras.metrics.Metric): + r"""Computes F-Beta score. + Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================== + + It is the weighted harmonic mean of precision + and recall. Output range is `[0, 1]`. Works for + both multi-class and multi-label classification. + $$ + F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} + $$ + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` and + `weighted`. Default value is None. + beta: Determines the weight of precision and recall + in harmonic mean. Determines the weight given to the + precision and recall. Default value is 1. + threshold: Elements of `y_pred` greater than threshold are + converted to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + Returns: + F-Beta Score: float. + """ + + # Modification: remove the run-time type checking for functions + def __init__(self, num_classes, average=None, beta=1.0, threshold=None, + name="fbeta_score", dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype) + + if average not in (None, "micro", "macro", "weighted"): + raise ValueError( + "Unknown average type. 
Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name): + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state(self, y_true, y_pred, sample_weight=None): + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_states(self): + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) + + +@protected_register_keras_serializable() +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    # http://www.apache.org/licenses/LICENSE-2.0
+    # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ==============================================================================
+
+    It is the harmonic mean of precision and recall.
+    Output range is `[0, 1]`. Works for both multi-class
+    and multi-label classification.
+    $$
+    F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}}
+    $$
+    Args:
+        num_classes: Number of unique classes in the dataset.
+        average: Type of averaging to be performed on data.
+            Acceptable values are `None`, `micro`, `macro`
+            and `weighted`. Default value is None.
+        threshold: Elements of `y_pred` above threshold are
+            considered to be 1, and the rest 0. If threshold is
+            None, the argmax is converted to 1, and the rest 0.
+        name: (Optional) String name of the metric instance.
+        dtype: (Optional) Data type of the metric result.
+    Returns:
+        F-1 Score: float.
+    """
+
+    # Modification: remove the run-time type checking for functions
+    def __init__(self, num_classes, average=None, threshold=None,
+                 name="f1_score", dtype=None):
+        super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype)
+
+    def get_config(self):
+        base_config = super().get_config()
+        del base_config["beta"]
+        return base_config
+
+
+class CharLoadTFCnnModel(BaseTrainableModel,
+                         metaclass=AutoSubRegistrationMeta):
+
+    # boolean if the label mapping requires the mapping for index 0 reserved
+    requires_zero_mapping = False
+
+    def __init__(self, original_model_path, label_mapping=None,
+                 parameters=None):
+        """
+        Initialize the CharLoadTFCnnModel; also sets the epoch id to zero.
+
+        :param original_model_path: path to the saved TF model to load
+        :type original_model_path: str
+        :param label_mapping: maps labels to their encoded integers
+        :type label_mapping: dict
+        :param parameters: Contains all the appropriate parameters for the
+            model. The only optional parameter is default_label;
+            original_model_path and pad_label are set automatically.
+        :type parameters: dict
+        :return: None
+        """
+
+        # parameter initialization
+        if not parameters:
+            parameters = {}
+        parameters.setdefault('default_label', "UNKNOWN")
+        parameters['original_model_path'] = original_model_path
+        parameters['pad_label'] = 'PAD'
+        self._epoch_id = 0
+
+        # reconstruct flags for model
+        self._model_num_labels = 0
+        self._model_default_ind = -1
+
+        BaseModel.__init__(self, label_mapping, parameters)
+
+    def __eq__(self, other):
+        """
+        Check whether two models are equal; only key variables may be
+        compared, i.e. the underlying TF model itself may not be checked.
+
+        :param self: a model
+        :param other: a model
+        :type self: BaseModel
+        :type other: BaseModel
+        :return: Whether or not self and other are equal
+        :rtype: bool
+        """
+        if self._parameters != other._parameters \
+                or self._label_mapping != other._label_mapping:
+            return False
+        return True
+
+    def _validate_parameters(self, parameters):
+        """
+        Validate the parameters sent in. Raise error if invalid parameters are
+        present.
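+
+        An example dict that passes validation (the path is hypothetical)::
+
+            {
+                'original_model_path': '/path/to/saved_model',
+                'default_label': 'UNKNOWN',
+                'pad_label': 'PAD',
+            }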
+
+        :param parameters: parameter dict containing the following parameters:
+            original_model_path: Path to the saved TF model that is loaded
+            default_label: Key for label_mapping that is the default label
+            pad_label: Key for entities_dict that is the pad label
+        :type parameters: dict
+        :return: None
+        """
+        errors = []
+        list_of_necessary_params = ['original_model_path', 'default_label',
+                                    'pad_label']
+
+        # Make sure the necessary parameters are present and valid.
+        for param in parameters:
+            if param in ['default_label', 'original_model_path', 'pad_label']:
+                if not isinstance(parameters[param], str):
+                    error = str(param) + " must be a string."
+                    errors.append(error)
+
+        # Error if there are extra parameters thrown in
+        for param in parameters:
+            if param not in list_of_necessary_params:
+                errors.append(param + " is not an accepted parameter.")
+        if errors:
+            raise ValueError('\n'.join(errors))
+
+    def set_label_mapping(self, label_mapping):
+        """
+        Sets the labels for the model.
+
+        :param label_mapping: label mapping of the model
+        :type label_mapping: dict
+        :return: None
+        """
+        if not isinstance(label_mapping, (list, dict)):
+            raise TypeError("Labels must either be a non-empty encoding dict "
+                            "which maps labels to index encodings or a list.")
+
+        label_mapping = copy.deepcopy(label_mapping)
+        if 'PAD' not in label_mapping:
+            if isinstance(label_mapping, list):  # if list missing PAD
+                label_mapping = ['PAD'] + label_mapping
+            elif 0 not in label_mapping.values():  # if dict missing PAD and 0
+                label_mapping.update({'PAD': 0})
+            else:
+                label_mapping.update(
+                    {'PAD': max(list(label_mapping.values())) + 1})
+        if self._parameters['default_label'] not in label_mapping:
+            raise ValueError("The `default_label` of {} must exist in the "
+                             "label mapping.".format(
+                                 self._parameters['default_label']))
+        super().set_label_mapping(label_mapping)
+
+    def _need_to_reconstruct_model(self):
+        """
+        Determines whether or not the model needs to be reconstructed, i.e.
+        whether the number of labels or the default label index has changed
+        since the model was built.
+
+        :return: bool of whether or not the model needs to reconstruct.
+ """ + if not self._model: + return False + default_ind = self.label_mapping[self._parameters['default_label']] + return self.num_labels != self._model_num_labels or \ + default_ind != self._model_default_ind + + def save_to_disk(self, dirpath): + """ + Saves whole model to disk with weights + + :param dirpath: directory path where you want to save the model to + :type dirpath: str + :return: None + """ + if not self._model: + self._construct_model() + elif self._need_to_reconstruct_model(): + self._reconstruct_model() + + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'w') as fp: + json.dump(self._parameters, fp) + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'w') as fp: + json.dump(self.label_mapping, fp) + self._model.save(os.path.join(dirpath)) + + @classmethod + def load_from_disk(cls, dirpath): + """ + Loads whole model from disk with weights + + :param dirpath: directory path where you want to load the model from + :type dirpath: str + :return: None + """ + + # load parameters + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'r') as fp: + parameters = json.load(fp) + + # load label_mapping + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'r') as fp: + label_mapping = json.load(fp) + + # use f1 score metric + custom_objects = { + "F1Score": F1Score( + num_classes=max(label_mapping.values()) + 1, + average='micro'), + "CharacterLevelCnnModel": cls, + } + with tf.keras.utils.custom_object_scope(custom_objects): + tf_model = tf.keras.models.load_model(dirpath) + + loaded_model = cls(label_mapping, parameters) + loaded_model._model = tf_model + + # load self + loaded_model._model_num_labels = loaded_model.num_labels + loaded_model._model_default_ind = loaded_model.label_mapping[ + loaded_model._parameters['default_label'] + ] + return loaded_model + + def _get_layer_index(self, layer_name): + for idx, layer in enumerate(self._model.layers): + if layer.name == layer_name: + return idx + + def _construct_model(self): + """ + Model constructor for the data labeler. This also serves as a weight + reset. 
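+        The model is reloaded from ``original_model_path``; if the loaded
+        softmax head does not match the current number of labels, a new
+        softmax output layer is attached in its place.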
+ + :return: None + """ + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + model_loc = self._parameters['original_model_path'] + + self._model = tf.keras.models.load_model(model_loc) + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_layer_ind = self._get_layer_index(softmax_output_layer_name) + softmax_layer = self._model.get_layer(softmax_output_layer_name) + prev_softmax_layer = softmax_layer.input + + new_softmax_layer = softmax_layer.output + if softmax_layer.weights[0].shape[-1] != num_labels: + new_softmax_layer = tf.keras.layers.Dense( + num_labels, activation='softmax', name="softmax_output")( + self._model.layers[softmax_layer_ind - 1].output) + + # Output the model into a .pb file for TensorFlow + argmax_layer = tf.keras.backend.argmax(new_softmax_layer) + + + argmax_outputs = [new_softmax_layer, argmax_layer] + self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + + # Compile the model w/ metrics + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def reset_weights(self): + """ + Reset the weights of the model. + + :return: None + """ + self._construct_model() + + def _reconstruct_model(self): + """ + Reconstruct the appropriate layers if the number of number of labels is + altered + + :return: None + """ + + # Reset model + tf.keras.backend.clear_session() + + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + + # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') + for _ in range(2): + self._model.layers.pop() + + # Add the final Softmax layer to the previous spot + final_softmax_layer = tf.keras.layers.Dense( + num_labels, activation='softmax', name="softmax_output")( + self._model.layers[-4].output) + + # Output the model into a .pb file for TensorFlow + argmax_layer = tf.keras.backend.argmax(final_softmax_layer) + + + argmax_outputs = [final_softmax_layer, argmax_layer] + self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + + # Compile the model + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, + reset_weights=False, verbose=True): + """ + Train the current model with the training data and validation data + + :param train_data: Training data used to train model + :type train_data: Union[list, np.ndarray] + :param val_data: Validation data used to validate the training + :type val_data: Union[list, np.ndarray] + :param batch_size: Used to determine number of samples in each batch + :type batch_size: int + :param label_mapping: maps labels to their encoded integers + :type label_mapping: 
Union[dict, None] + :param reset_weights: Flag to determine whether to reset the weights or + not + :type reset_weights: bool + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: None + """ + + if label_mapping is not None: + self.set_label_mapping(label_mapping) + + if not self._model: + self._construct_model() + else: + if self._need_to_reconstruct_model(): + self._reconstruct_model() + if reset_weights: + self.reset_weights() + + history = defaultdict() + f1 = None + f1_report = [] + + self._model.reset_metrics() + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + + start_time = time.time() + batch_id = 0 + for x_train, y_train in train_data: + model_results = self._model.train_on_batch( + x_train, {softmax_output_layer_name: y_train}) + sys.stdout.flush() + if verbose: + sys.stdout.write( + "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " + "f1_score %f" % + (self._epoch_id, batch_id, *model_results[1:])) + batch_id += 1 + + for i, metric_label in enumerate(self._model.metrics_names): + history[metric_label] = model_results[i] + + if val_data: + f1, f1_report = self._validate_training(val_data) + history['f1_report'] = f1_report + + val_f1 = f1_report['weighted avg']['f1-score'] \ + if f1_report else np.NAN + val_precision = f1_report['weighted avg']['precision'] \ + if f1_report else np.NAN + val_recall = f1_report['weighted avg']['recall'] \ + if f1_report else np.NAN + epoch_time = time.time() - start_time + logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " + "val_f1: %f - val_precision: %f - val_recall %f" % + (self._epoch_id, epoch_time, *model_results[1:], + val_f1, val_precision, val_recall)) + + self._epoch_id += 1 + + return history, f1, f1_report + + def _validate_training(self, val_data, batch_size_test=32, + verbose_log=True, verbose_keras=False): + """ + Validate the model on the test set and return the evaluation metrics. + + :param val_data: data generator for the validation + :type val_data: iterator + :param batch_size_test: Number of samples to process in testing + :type batch_size_test: int + :param verbose_log: whether or not to print out scores for training, + etc. + :type verbose_log: bool + :param verbose_keras: whether or not to print out scores for training, + from keras. + :type verbose_keras: bool + return (f1-score, f1 report). 
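+        Predictions are taken from the model's argmax output, i.e. the second
+        element returned by ``self._model.predict``.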
+ """ + f1 = None + f1_report = None + + if val_data is None: + return f1, f1_report + + # Predict on the test set + batch_id = 0 + y_val_pred = [] + y_val_test = [] + for x_val, y_val in val_data: + y_val_pred.append(self._model.predict( + x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) + y_val_test.append(np.argmax(y_val, axis=-1)) + batch_id += 1 + sys.stdout.flush() + if verbose_log: + sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % + (self._epoch_id, batch_id)) + + tf.keras.backend.set_floatx('float32') + # Clean the predicted entities and the actual entities + f1, f1_report = labeler_utils.evaluate_accuracy( + np.concatenate(y_val_pred, axis=0), + np.concatenate(y_val_test, axis=0), + self.num_labels, + self.reverse_label_mapping, + verbose=verbose_keras) + + return f1, f1_report + + def predict(self, data, batch_size=32, show_confidences=False, + verbose=True): + """ + Run model and get predictions + + :param data: text input + :type data: Union[list, numpy.ndarray] + :param batch_size: number of samples in the batch of data + :type batch_size: int + :param show_confidences: whether user wants prediction confidences + :type show_confidences: + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: char level predictions and confidences + :rtype: dict + """ + if not self._model: + raise ValueError("You are trying to predict without a model. " + "Construct/Load a model before predicting.") + elif self._need_to_reconstruct_model(): + raise RuntimeError("The model label mapping definitions have been " + "altered without additional training. Please " + "train the model or reset the label mapping to " + "predict.") + # Pre-allocate space for predictions + confidences = [] + # sentence_lengths = np.zeros((batch_size,), dtype=int) + # predictions = np.zeros((batch_size, self._parameters['max_length'])) + predictions = [] + # if show_confidences: + # confidences = np.zeros((batch_size, + # self._parameters['max_length'], + # self.num_labels)) + + # Run model with batching + allocation_index = 0 + for batch_id, batch_data in enumerate(data): + model_output = self._model( + tf.convert_to_tensor(batch_data) + ) + + # Count number of samples in batch to prevent array mismatch + num_samples_in_batch = len(batch_data) + allocation_index = batch_id * batch_size + + # Double array size + if len(predictions) <= allocation_index: + predictions += predictions + # sentence_lengths = np.pad( + # sentence_lengths, pad_width=((0, len(sentence_lengths)),), + # mode='constant') + if show_confidences: + confidences += confidences + + if show_confidences: + confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() + predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() + # sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) + + allocation_index += num_samples_in_batch + + # Convert predictions, confidences to lists from numpy + predictions = [predictions[i].tolist() for i in range(0, allocation_index)] + confidences_list = None + if show_confidences: + confidences = [confidences[i].tolist() + for i in range(0, allocation_index)] + + # # Append slices of predictions to return prediction & confidence matrices + # for index, sentence_length \ + # in enumerate(sentence_lengths[:allocation_index]): + # predictions_list[index] = list(predictions[index][:sentence_length]) + # if show_confidences: + # 
confidences_list[index] = list(confidences[index][:sentence_length]) + + if show_confidences: + return {'pred': predictions, 'conf': confidences} + return {'pred': predictions} + + def details(self): + """ + Prints the relevant details of the model (summary, parameters, label + mapping) + """ + print("\n###### Model Details ######\n") + self._model.summary() + print("\nModel Parameters:") + for key, value in self._parameters.items(): + print("{}: {}".format(key, value)) + print("\nModel Label Mapping:") + for key, value in self.label_mapping.items(): + print("{}: {}".format(key, value)) From e66e0b48cb31a306c554a8820651ae5dd967602d Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 4 Jul 2022 11:21:12 -0500 Subject: [PATCH 03/14] refactor: for new model --- ..._tf_trainable.py => char_load_tf_model.py} | 270 ++---------------- .../labelers/character_level_cnn_model.py | 264 +---------------- dataprofiler/labelers/labeler_utils.py | 254 ++++++++++++++++ 3 files changed, 282 insertions(+), 506 deletions(-) rename dataprofiler/labelers/{char_load_tf_trainable.py => char_load_tf_model.py} (64%) diff --git a/dataprofiler/labelers/char_load_tf_trainable.py b/dataprofiler/labelers/char_load_tf_model.py similarity index 64% rename from dataprofiler/labelers/char_load_tf_trainable.py rename to dataprofiler/labelers/char_load_tf_model.py index 492b4324f..568b335f7 100644 --- a/dataprofiler/labelers/char_load_tf_trainable.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -19,254 +19,22 @@ _file_dir = os.path.dirname(os.path.abspath(__file__)) logger = dp_logging.get_child_logger(__name__) +labeler_utils.hide_tf_logger_warnings() -class NoV1ResourceMessageFilter(logging.Filter): - """Removes TF2 warning for using TF1 model which has resources.""" - def filter(self, record): - msg = 'is a problem, consider rebuilding the SavedModel after ' + \ - 'running tf.compat.v1.enable_resource_variables()' - return msg not in record.getMessage() - - -tf_logger = logging.getLogger('tensorflow') -tf_logger.addFilter(NoV1ResourceMessageFilter()) - - -def protected_register_keras_serializable(package='Custom', name=None): - """ - Protects against already registered keras serializable layers. This - ensures that if it was already registered, it will not try to register it - again. - """ - def decorator(arg): - """Protects against double registration of a keras layer.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - if tf.keras.utils.get_registered_object(registered_name) is None: - tf.keras.utils.register_keras_serializable(package, name)(arg) - return arg - return decorator - - -@protected_register_keras_serializable() -class FBetaScore(tf.keras.metrics.Metric): - r"""Computes F-Beta score. - Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the weighted harmonic mean of precision - and recall. Output range is `[0, 1]`. Works for - both multi-class and multi-label classification. - $$ - F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` and - `weighted`. Default value is None. - beta: Determines the weight of precision and recall - in harmonic mean. Determines the weight given to the - precision and recall. Default value is 1. - threshold: Elements of `y_pred` greater than threshold are - converted to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-Beta Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, beta=1.0, threshold=None, - name="fbeta_score", dtype=None, **kwargs): - super().__init__(name=name, dtype=dtype) - - if average not in (None, "micro", "macro", "weighted"): - raise ValueError( - "Unknown average type. Acceptable values " - "are: [None, 'micro', 'macro', 'weighted']" - ) - - if not isinstance(beta, float): - raise TypeError("The value of beta should be a python float") - - if beta <= 0.0: - raise ValueError("beta value should be greater than zero") - - if threshold is not None: - if not isinstance(threshold, float): - raise TypeError("The value of threshold should be a python float") - if threshold > 1.0 or threshold <= 0.0: - raise ValueError("threshold should be between 0 and 1") - - self.num_classes = num_classes - self.average = average - self.beta = beta - self.threshold = threshold - self.axis = None - self.init_shape = [] - - if self.average != "micro": - self.axis = 0 - self.init_shape = [self.num_classes] - - def _zero_wt_init(name): - return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype - ) - - self.true_positives = _zero_wt_init("true_positives") - self.false_positives = _zero_wt_init("false_positives") - self.false_negatives = _zero_wt_init("false_negatives") - self.weights_intermediate = _zero_wt_init("weights_intermediate") - - def update_state(self, y_true, y_pred, sample_weight=None): - if self.threshold is None: - threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) - # make sure [0, 0, 0] doesn't become [1, 1, 1] - # Use abs(x) > eps, instead of x != 0 to check for zero - y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) - else: - y_pred = y_pred > self.threshold - - y_true = tf.cast(y_true, self.dtype) - y_pred = tf.cast(y_pred, self.dtype) - - def _weighted_sum(val, sample_weight): - if sample_weight is not None: - val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) - return tf.reduce_sum(val, axis=self.axis) - - self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) - self.false_positives.assign_add( - _weighted_sum(y_pred * (1 - y_true), sample_weight) - ) - self.false_negatives.assign_add( - _weighted_sum((1 - y_pred) * y_true, sample_weight) - ) - self.weights_intermediate.assign_add(_weighted_sum(y_true, 
sample_weight)) - - def result(self): - precision = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_positives - ) - recall = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_negatives - ) - - mul_value = precision * recall - add_value = (tf.math.square(self.beta) * precision) + recall - mean = tf.math.divide_no_nan(mul_value, add_value) - f1_score = mean * (1 + tf.math.square(self.beta)) - - if self.average == "weighted": - weights = tf.math.divide_no_nan( - self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) - ) - f1_score = tf.reduce_sum(f1_score * weights) - - elif self.average is not None: # [micro, macro] - f1_score = tf.reduce_mean(f1_score) - - return f1_score - - def get_config(self): - """Returns the serializable config of the metric.""" - - config = { - "num_classes": self.num_classes, - "average": self.average, - "beta": self.beta, - "threshold": self.threshold, - } - - base_config = super().get_config() - return {**base_config, **config} - - def reset_states(self): - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - - -@protected_register_keras_serializable() -class F1Score(FBetaScore): - r"""Computes F-1 Score. - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the harmonic mean of precision and recall. - Output range is `[0, 1]`. Works for both multi-class - and multi-label classification. - $$ - F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` - and `weighted`. Default value is None. - threshold: Elements of `y_pred` above threshold are - considered to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-1 Score: float. 
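# Throughout this patch series the F-beta LaTeX in the docstrings was garbled
# in transit: the numerator should read precision * recall, not
# precision * precision, i.e.
#   F_beta = (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)
# which matches the result() implementation. A small standalone check of that
# formula from raw counts (illustrative only, not part of the patches):

def fbeta_from_counts(tp, fp, fn, beta=1.0):
    """Compute F-beta from true positive, false positive, false negative counts."""
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    denominator = beta ** 2 * precision + recall
    return (1 + beta ** 2) * precision * recall / denominator if denominator else 0.0

# precision = recall = 0.8, so F1 collapses to 0.8 as well
assert abs(fbeta_from_counts(tp=8, fp=2, fn=2) - 0.8) < 1e-12
# beta > 1 weighs recall more heavily; with precision == recall it is unchanged
assert abs(fbeta_from_counts(tp=8, fp=2, fn=2, beta=2.0) - 0.8) < 1e-12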
- """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, threshold=None, - name="f1_score", dtype=None): - super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) - - def get_config(self): - base_config = super().get_config() - del base_config["beta"] - return base_config - - -class CharLoadTFCnnModel(BaseTrainableModel, - metaclass=AutoSubRegistrationMeta): +class CharLoadTFModel(BaseTrainableModel, + metaclass=AutoSubRegistrationMeta): # boolean if the label mapping requires the mapping for index 0 reserved requires_zero_mapping = False - def __init__(self, original_model_path, label_mapping=None, + def __init__(self, model_path, label_mapping=None, parameters=None): """ - CNN Model Initializer. initialize epoch_id + Loadable TF Model Initializer. + :param model_path: path to model to load + :type model_path: str :param label_mapping: maps labels to their encoded integers :type label_mapping: dict :param parameters: Contains all the appropriate parameters for the @@ -281,7 +49,7 @@ def __init__(self, original_model_path, label_mapping=None, if not parameters: parameters = {} parameters.setdefault('default_label', "UNKNOWN") - parameters['original_model_path'] = original_model_path + parameters['model_path'] = model_path parameters['pad_label'] = 'PAD' self._epoch_id = 0 @@ -327,12 +95,12 @@ def _validate_parameters(self, parameters): :return: None """ errors = [] - list_of_necessary_params = ['original_model_path', 'default_label', + list_of_necessary_params = ['model_path', 'default_label', 'pad_label'] # Make sure the necessary parameters are present and valid. for param in parameters: - if param in ['default_label', 'original_model_path', 'pad_label']: + if param in ['default_label', 'model_path', 'pad_label']: if not isinstance(parameters[param], str): error = str(param) + " must be a string." errors.append(error) @@ -426,7 +194,7 @@ def load_from_disk(cls, dirpath): # use f1 score metric custom_objects = { - "F1Score": F1Score( + "F1Score": labeler_utils.F1Score( num_classes=max(label_mapping.values()) + 1, average='micro'), "CharacterLevelCnnModel": cls, @@ -444,11 +212,6 @@ def load_from_disk(cls, dirpath): ] return loaded_model - def _get_layer_index(self, layer_name): - for idx, layer in enumerate(self._model.layers): - if layer.name == layer_name: - return idx - def _construct_model(self): """ Model constructor for the data labeler. 
This also serves as a weight @@ -458,11 +221,12 @@ def _construct_model(self): """ num_labels = self.num_labels default_ind = self.label_mapping[self._parameters['default_label']] - model_loc = self._parameters['original_model_path'] + model_loc = self._parameters['model_path'] self._model = tf.keras.models.load_model(model_loc) softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - softmax_layer_ind = self._get_layer_index(softmax_output_layer_name) + softmax_layer_ind = labeler_utils.get_tf_layer_index_from_name( + self._model, softmax_output_layer_name) softmax_layer = self._model.get_layer(softmax_output_layer_name) prev_softmax_layer = softmax_layer.input @@ -484,7 +248,8 @@ def _construct_model(self): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -536,7 +301,8 @@ def _reconstruct_model(self): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 4a9e84311..c02fae1b9 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -18,253 +18,7 @@ _file_dir = os.path.dirname(os.path.abspath(__file__)) logger = dp_logging.get_child_logger(__name__) - - -class NoV1ResourceMessageFilter(logging.Filter): - """Removes TF2 warning for using TF1 model which has resources.""" - - def filter(self, record): - msg = ( - "is a problem, consider rebuilding the SavedModel after " - + "running tf.compat.v1.enable_resource_variables()" - ) - return msg not in record.getMessage() - - -tf_logger = logging.getLogger("tensorflow") -tf_logger.addFilter(NoV1ResourceMessageFilter()) - - -def protected_register_keras_serializable(package='Custom', name=None): - """ - Protects against already registered keras serializable layers. This - ensures that if it was already registered, it will not try to register it - again. - """ - def decorator(arg): - """Protects against double registration of a keras layer.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - if tf.keras.utils.get_registered_object(registered_name) is None: - tf.keras.utils.register_keras_serializable(package, name)(arg) - return arg - return decorator - - -@protected_register_keras_serializable() -class FBetaScore(tf.keras.metrics.Metric): - r"""Computes F-Beta score. - Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. 
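# The _construct_model logic above rebuilds a loaded model by cutting off its
# softmax head and attaching one sized to the new label set before
# recompiling. A standalone sketch of that surgery on a toy model (the layer
# sizes and names here are hypothetical, not DataProfiler's):
import tensorflow as tf

def swap_softmax_head(model, num_labels):
    """Return a model whose final softmax layer is resized to num_labels."""
    old_head = model.layers[-1]
    new_output = tf.keras.layers.Dense(
        num_labels, activation="softmax", name="softmax_output")(old_head.input)
    new_model = tf.keras.Model(inputs=model.inputs, outputs=new_output)
    new_model.compile(loss="categorical_crossentropy", optimizer="adam",
                      metrics=["acc"])
    return new_model

toy = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(4,)),
    tf.keras.layers.Dense(8, activation="relu", name="body"),
    tf.keras.layers.Dense(3, activation="softmax", name="old_head"),
])
resized = swap_softmax_head(toy, num_labels=5)
assert resized.output_shape[-1] == 5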
- # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the weighted harmonic mean of precision - and recall. Output range is `[0, 1]`. Works for - both multi-class and multi-label classification. - $$ - F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` and - `weighted`. Default value is None. - beta: Determines the weight of precision and recall - in harmonic mean. Determines the weight given to the - precision and recall. Default value is 1. - threshold: Elements of `y_pred` greater than threshold are - converted to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-Beta Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__( - self, - num_classes, - average=None, - beta=1.0, - threshold=None, - name="fbeta_score", - dtype=None, - **kwargs - ): - super().__init__(name=name, dtype=dtype) - - if average not in (None, "micro", "macro", "weighted"): - raise ValueError( - "Unknown average type. 
Acceptable values " - "are: [None, 'micro', 'macro', 'weighted']" - ) - - if not isinstance(beta, float): - raise TypeError("The value of beta should be a python float") - - if beta <= 0.0: - raise ValueError("beta value should be greater than zero") - - if threshold is not None: - if not isinstance(threshold, float): - raise TypeError("The value of threshold should be a python float") - if threshold > 1.0 or threshold <= 0.0: - raise ValueError("threshold should be between 0 and 1") - - self.num_classes = num_classes - self.average = average - self.beta = beta - self.threshold = threshold - self.axis = None - self.init_shape = [] - - if self.average != "micro": - self.axis = 0 - self.init_shape = [self.num_classes] - - def _zero_wt_init(name): - return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype - ) - - self.true_positives = _zero_wt_init("true_positives") - self.false_positives = _zero_wt_init("false_positives") - self.false_negatives = _zero_wt_init("false_negatives") - self.weights_intermediate = _zero_wt_init("weights_intermediate") - - def update_state(self, y_true, y_pred, sample_weight=None): - if self.threshold is None: - threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) - # make sure [0, 0, 0] doesn't become [1, 1, 1] - # Use abs(x) > eps, instead of x != 0 to check for zero - y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) - else: - y_pred = y_pred > self.threshold - - y_true = tf.cast(y_true, self.dtype) - y_pred = tf.cast(y_pred, self.dtype) - - def _weighted_sum(val, sample_weight): - if sample_weight is not None: - val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) - return tf.reduce_sum(val, axis=self.axis) - - self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) - self.false_positives.assign_add( - _weighted_sum(y_pred * (1 - y_true), sample_weight) - ) - self.false_negatives.assign_add( - _weighted_sum((1 - y_pred) * y_true, sample_weight) - ) - self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) - - def result(self): - precision = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_positives - ) - recall = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_negatives - ) - - mul_value = precision * recall - add_value = (tf.math.square(self.beta) * precision) + recall - mean = tf.math.divide_no_nan(mul_value, add_value) - f1_score = mean * (1 + tf.math.square(self.beta)) - - if self.average == "weighted": - weights = tf.math.divide_no_nan( - self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) - ) - f1_score = tf.reduce_sum(f1_score * weights) - - elif self.average is not None: # [micro, macro] - f1_score = tf.reduce_mean(f1_score) - - return f1_score - - def get_config(self): - """Returns the serializable config of the metric.""" - - config = { - "num_classes": self.num_classes, - "average": self.average, - "beta": self.beta, - "threshold": self.threshold, - } - - base_config = super().get_config() - return {**base_config, **config} - - def reset_states(self): - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - - -@protected_register_keras_serializable() -class F1Score(FBetaScore): - r"""Computes F-1 Score. - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
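# A quick numeric illustration (not from the source) of what the `average`
# argument above changes. With one frequent class and one rare class, macro
# averaging weights both classes equally, while micro averaging pools the raw
# counts, so the frequent class dominates:

def f1(tp, fp, fn):
    precision, recall = tp / (tp + fp), tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

per_class = {"A": (90, 10, 10), "B": (1, 9, 9)}  # (tp, fp, fn) per class

macro = sum(f1(*counts) for counts in per_class.values()) / len(per_class)
micro = f1(*(sum(column) for column in zip(*per_class.values())))

assert round(macro, 3) == 0.5    # (0.9 + 0.1) / 2
assert round(micro, 3) == 0.827  # pooled counts: tp=91, fp=19, fn=19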
- # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the harmonic mean of precision and recall. - Output range is `[0, 1]`. Works for both multi-class - and multi-label classification. - $$ - F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` - and `weighted`. Default value is None. - threshold: Elements of `y_pred` above threshold are - considered to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-1 Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__( - self, num_classes, average=None, threshold=None, name="f1_score", dtype=None - ): - super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) - - def get_config(self): - base_config = super().get_config() - del base_config["beta"] - return base_config +labeler_utils.hide_tf_logger_warnings() def build_embd_dictionary(filename): @@ -539,9 +293,9 @@ def load_from_disk(cls, dirpath): # use f1 score metric custom_objects = { - "F1Score": F1Score( - num_classes=max(label_mapping.values()) + 1, average="micro" - ), + "F1Score": labeler_utils.F1Score( + num_classes=max(label_mapping.values()) + 1, + average='micro'), "CharacterLevelCnnModel": cls, } with tf.keras.utils.custom_object_scope(custom_objects): @@ -761,8 +515,9 @@ def encoding_function(input_str): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average="micro") - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -821,8 +576,9 @@ def _reconstruct_model(self): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average="micro") - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) self._epoch_id = 0 diff --git a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index 4bf92fb02..679233e39 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -1,9 +1,11 @@ 
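# hide_tf_logger_warnings(), added to labeler_utils below, silences one known
# TensorFlow message by attaching a logging.Filter to the 'tensorflow'
# logger. The mechanism in isolation (generic sketch, not DataProfiler code):
import logging

class DropSubstringFilter(logging.Filter):
    """Drop any record whose formatted message contains `needle`."""

    def __init__(self, needle):
        super().__init__()
        self.needle = needle

    def filter(self, record):
        # Returning False suppresses the record.
        return self.needle not in record.getMessage()

demo_logger = logging.getLogger("demo")
demo_logger.addFilter(DropSubstringFilter("rebuilding the SavedModel"))
demo_logger.warning("consider rebuilding the SavedModel ...")  # suppressed
demo_logger.warning("an unrelated warning")                    # passes through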
import os import warnings +import logging import numpy as np import scipy from sklearn.exceptions import UndefinedMetricWarning +import tensorflow as tf from .. import dp_logging from .classification_report_utils import classification_report @@ -196,3 +198,255 @@ def evaluate_accuracy( logger.info(f"F1 Score: {f1}") return f1, f1_report + + +def get_tf_layer_index_from_name(model, layer_name): + """ + Returns the index of the layer given the layer name within a tf model + + :param model: tf keras model to search + :param layer_name: name of the layer to find + :return: layer index if it exists or None + """ + for idx, layer in enumerate(model.layers): + if layer.name == layer_name: + return idx + + +def hide_tf_logger_warnings(): + """ + Filters out a set of warnings from the tf logger. + """ + class NoV1ResourceMessageFilter(logging.Filter): + """Removes TF2 warning for using TF1 model which has resources.""" + def filter(self, record): + msg = 'is a problem, consider rebuilding the SavedModel after ' + \ + 'running tf.compat.v1.enable_resource_variables()' + return msg not in record.getMessage() + + + tf_logger = logging.getLogger('tensorflow') + tf_logger.addFilter(NoV1ResourceMessageFilter()) + + +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. + """ + def decorator(arg): + """Protects against double registration of a keras layer.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + '>' + class_name + if tf.keras.utils.get_registered_object(registered_name) is None: + tf.keras.utils.register_keras_serializable(package, name)(arg) + return arg + return decorator + + +@protected_register_keras_serializable() +class FBetaScore(tf.keras.metrics.Metric): + r"""Computes F-Beta score. + Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================== + + It is the weighted harmonic mean of precision + and recall. Output range is `[0, 1]`. Works for + both multi-class and multi-label classification. + $$ + F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} + $$ + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` and + `weighted`. Default value is None. + beta: Determines the weight of precision and recall + in harmonic mean. Determines the weight given to the + precision and recall. Default value is 1. 
+ threshold: Elements of `y_pred` greater than threshold are + converted to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + Returns: + F-Beta Score: float. + """ + + # Modification: remove the run-time type checking for functions + def __init__(self, num_classes, average=None, beta=1.0, threshold=None, + name="fbeta_score", dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype) + + if average not in (None, "micro", "macro", "weighted"): + raise ValueError( + "Unknown average type. Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name): + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state(self, y_true, y_pred, sample_weight=None): + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + 
config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_states(self): + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) + + +@protected_register_keras_serializable() +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================== + + It is the harmonic mean of precision and recall. + Output range is `[0, 1]`. Works for both multi-class + and multi-label classification. + $$ + F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} + $$ + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` + and `weighted`. Default value is None. + threshold: Elements of `y_pred` above threshold are + considered to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + Returns: + F-1 Score: float. 
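# Why the protected_register_keras_serializable guard above is useful: the
# keras registry is process-global, so code paths that each try to register
# the same "package>name" (for example via repeated imports or module
# reloads) could collide. The guard makes registration idempotent. A minimal
# sketch of the same idea with a hypothetical metric (illustrative only):
import tensorflow as tf

def idempotent_register(package="Custom", name=None):
    def decorator(cls):
        registered_name = package + ">" + (name if name is not None else cls.__name__)
        if tf.keras.utils.get_registered_object(registered_name) is None:
            tf.keras.utils.register_keras_serializable(package, name)(cls)
        return cls
    return decorator

@idempotent_register()
@idempotent_register()  # second application is a no-op, not a re-registration
class DemoMetric(tf.keras.metrics.Mean):
    pass

assert tf.keras.utils.get_registered_object("Custom>DemoMetric") is DemoMetric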
+ """ + + # Modification: remove the run-time type checking for functions + def __init__(self, num_classes, average=None, threshold=None, + name="f1_score", dtype=None): + super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) + + def get_config(self): + base_config = super().get_config() + del base_config["beta"] + return base_config From 2066271490cb7bf03af1f67402ce6bd0a05c6913 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 5 Jul 2022 13:54:06 -0500 Subject: [PATCH 04/14] fix: add util tests --- .../tests/labelers/test_labeler_utils.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index 0b999c41d..b352ebd9e 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -1,8 +1,10 @@ import unittest from unittest import mock +import logging import numpy as np import pandas as pd +import tensorflow as tf from dataprofiler.labelers import labeler_utils @@ -267,3 +269,33 @@ def test_save_conf_mat(self, mock_dataframe): self.assertDictEqual(expected_row_col_names, mock_dataframe.call_args[1]) mock_instance_df.to_csv.assert_called() + + +class TestTFFunctions(unittest.TestCase): + + def test_get_tf_layer_index_from_name(self): + model = tf.keras.Sequential() + model.add(tf.keras.Input((1, 2), name='input')) + model.add(tf.keras.layers.Dense(units=4, name='dense0')) + model.add(tf.keras.layers.Dense(units=3, name='dense1')) + + ind = labeler_utils.get_tf_layer_index_from_name(model, 'not a layer') + self.assertIsNone(ind) + + # input is not counted in the layer + ind = labeler_utils.get_tf_layer_index_from_name(model, 'input') + self.assertIsNone(ind) + + ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense1') + self.assertEqual(1, ind) + + ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense0') + self.assertEqual(0, ind) + + def test_hide_tf_logger_warnings(self): + logger = logging.getLogger('tensorflow') + self.assertListEqual([], logger.filters) + + # make change and validate updated filter + labeler_utils.hide_tf_logger_warnings() + self.assertEqual(1, len(logger.filters)) From c108d705811fe70d29e069ef55cfb5a7bbe2d9b9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 5 Jul 2022 14:50:11 -0500 Subject: [PATCH 05/14] fix: remove comments --- dataprofiler/labelers/char_load_tf_model.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 568b335f7..5e1bd6701 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -460,13 +460,7 @@ def predict(self, data, batch_size=32, show_confidences=False, "predict.") # Pre-allocate space for predictions confidences = [] - # sentence_lengths = np.zeros((batch_size,), dtype=int) - # predictions = np.zeros((batch_size, self._parameters['max_length'])) predictions = [] - # if show_confidences: - # confidences = np.zeros((batch_size, - # self._parameters['max_length'], - # self.num_labels)) # Run model with batching allocation_index = 0 @@ -482,16 +476,12 @@ def predict(self, data, batch_size=32, show_confidences=False, # Double array size if len(predictions) <= allocation_index: predictions += predictions - # sentence_lengths = np.pad( - # sentence_lengths, pad_width=((0, len(sentence_lengths)),), - # mode='constant') if show_confidences: confidences += 
confidences if show_confidences: confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() - # sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) allocation_index += num_samples_in_batch @@ -502,13 +492,6 @@ def predict(self, data, batch_size=32, show_confidences=False, confidences = [confidences[i].tolist() for i in range(0, allocation_index)] - # # Append slices of predictions to return prediction & confidence matrices - # for index, sentence_length \ - # in enumerate(sentence_lengths[:allocation_index]): - # predictions_list[index] = list(predictions[index][:sentence_length]) - # if show_confidences: - # confidences_list[index] = list(confidences[index][:sentence_length]) - if show_confidences: return {'pred': predictions, 'conf': confidences} return {'pred': predictions} From 84c3d9c0614a30f1c1857aacd810cd99e6d33320 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 11:37:17 -0500 Subject: [PATCH 06/14] feat: add test for data loading --- .../test_char_load_tf_data_labeler.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py diff --git a/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py new file mode 100644 index 000000000..2f8237db5 --- /dev/null +++ b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py @@ -0,0 +1,148 @@ +import json +import os +import unittest +from io import StringIO +from unittest import mock + +from dataprofiler.labelers import DataLabeler, UnstructuredDataLabeler, \ + data_processing +from dataprofiler.labelers.char_load_tf_model import \ + CharLoadTFModel + + +test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + +data_labeler_parameters = { + 'model': { + 'class': 'CharLoadTFModel', + 'parameters': {} + }, + 'label_mapping': { + 'PAD': 0, + 'CITY': 1, # SAME AS UNKNOWN + 'UNKNOWN': 1, + 'ADDRESS': 2, + 'PERSON': 3, + }, + 'preprocessor': { + 'class': 'CharEncodedPreprocessor' + }, + 'postprocessor': { + 'class': 'CharPostprocessor' + }, +} + +preprocessor_parameters = { + 'encoding_map': {'t': 1, 's': 2}, + 'flatten_split': 0, + 'flatten_separator': ' ', + 'is_separate_at_max_len': True, + +} + +postprocessor_parameters = { + 'use_word_level_argmax': True, + 'output_format': 'character_argmax', + 'separators': (' ', ',', ';', "'", '"', ':', '\n', '\t', "."), + 'word_level_min_percent': 0.75, +} + + +def mock_open(filename, *args): + if filename.find('data_labeler_parameters') >= 0: + return StringIO(json.dumps(data_labeler_parameters)) + elif filename.find('preprocessor_parameters') >= 0: + return StringIO(json.dumps(preprocessor_parameters)) + elif filename.find('postprocessor_parameters') >= 0: + return StringIO(json.dumps(postprocessor_parameters)) + + +def setup_save_mock_open(mock_open): + mock_file = StringIO() + mock_file.close = lambda: None + mock_open.side_effect = lambda *args: mock_file + return mock_file + + +@mock.patch('dataprofiler.labelers.data_processing.BaseDataProcessor') +@mock.patch('dataprofiler.labelers.char_load_tf_model.' 
+ 'CharLoadTFModel.load_from_disk') +@mock.patch("builtins.open", side_effect=mock_open) +class TestCharTFLoadDataLabeler(unittest.TestCase): + + @staticmethod + def _setup_mock_load_model(mock_load_model): + model_mock = mock.Mock(spec=CharLoadTFModel) + model_mock.set_num_labels = mock.Mock() + mock_load_model.return_value = model_mock + model_mock.requires_zero_mapping = True + model_mock.labels = ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON'] + model_mock.label_mapping = { + 'PAD': 0, + 'CITY': 1, # SAME AS UNKNOWN + 'UNKNOWN': 1, + 'ADDRESS': 2, + 'PERSON': 3, + } + model_mock.reverse_label_mapping = { + 0: 'PAD', + 1: 'UNKNOWN', + 2: 'ADDRESS', + 3: 'PERSON', + } + + @staticmethod + def _setup_mock_load_processor(mock_base_processor): + def side_effect(arg): + processor = { + 'CharEncodedPreprocessor': mock.Mock( + spec=data_processing.CharEncodedPreprocessor), + 'CharPostprocessor': mock.Mock( + spec=data_processing.CharPostprocessor), + }[arg] + processor.load_from_disk.return_value = processor + return processor + + mock_base_processor.get_class.side_effect = side_effect + + def test_load_from_disk(self, mock_open, mock_load_model, + mock_base_processor): + + self._setup_mock_load_model(mock_load_model) + self._setup_mock_load_processor(mock_base_processor) + + # load default + data_labeler = DataLabeler.load_from_disk('fake/path') + + self.assertDictEqual(data_labeler.label_mapping, + data_labeler_parameters['label_mapping']) + self.assertListEqual( + data_labeler.labels, + ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON']) + self.assertIsInstance( + data_labeler.preprocessor, data_processing.BaseDataPreprocessor) + self.assertIsInstance( + data_labeler.postprocessor, data_processing.BaseDataPostprocessor) + + def test_save_to_disk(self, mock_open, mock_load_model, + mock_load_processor, *mocks): + + self._setup_mock_load_model(mock_load_model) + self._setup_mock_load_processor(mock_load_processor) + + # call func + data_labeler = UnstructuredDataLabeler() + + # setup save mock + mock_file = setup_save_mock_open(mock_open) + + # save and test + data_labeler.save_to_disk('test/path') + self.assertEqual( + '{"model": {"class": "CharLoadTFModel"}, ' + '"preprocessor": {"class": "CharEncodedPreprocessor"}, ' + '"postprocessor": {"class": "CharPostprocessor"}}', + mock_file.getvalue()) + + # close mock + StringIO.close(mock_file) From c60e678bc31c591b6c4d5596c5ec1cd86c7d9e06 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:04:38 -0500 Subject: [PATCH 07/14] fix: imports --- dataprofiler/labelers/char_load_tf_model.py | 3 --- dataprofiler/labelers/character_level_cnn_model.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 5e1bd6701..0a1f40454 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -3,13 +3,10 @@ import os import sys import time -import logging from collections import defaultdict -import functools import tensorflow as tf import numpy as np -from sklearn import decomposition from . 
import labeler_utils from .base_model import BaseModel, BaseTrainableModel diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index c02fae1b9..3c5e50957 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -1,11 +1,9 @@ import copy import json -import logging import os import sys import time from collections import defaultdict -import functools import numpy as np import tensorflow as tf From d95e37537afd54d02ac80cb98caf5741a7fd7e4f Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:05:20 -0500 Subject: [PATCH 08/14] feat: add tests --- dataprofiler/labelers/char_load_tf_model.py | 19 +- .../tests/labelers/test_char_tf_load_model.py | 373 ++++++++++++++++++ 2 files changed, 384 insertions(+), 8 deletions(-) create mode 100644 dataprofiler/tests/labelers/test_char_tf_load_model.py diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 0a1f40454..5abc1fa96 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -162,12 +162,14 @@ def save_to_disk(self, dirpath): self._reconstruct_model() model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + model_parameters = self._parameters.copy() + model_parameters.pop("model_path") with open(model_param_dirpath, 'w') as fp: - json.dump(self._parameters, fp) + json.dump(model_parameters, fp) labels_dirpath = os.path.join(dirpath, "label_mapping.json") with open(labels_dirpath, 'w') as fp: json.dump(self.label_mapping, fp) - self._model.save(os.path.join(dirpath)) + self._model.save(dirpath) @classmethod def load_from_disk(cls, dirpath): @@ -194,12 +196,12 @@ def load_from_disk(cls, dirpath): "F1Score": labeler_utils.F1Score( num_classes=max(label_mapping.values()) + 1, average='micro'), - "CharacterLevelCnnModel": cls, + "CharLoadTFModel": cls, } with tf.keras.utils.custom_object_scope(custom_objects): tf_model = tf.keras.models.load_model(dirpath) - loaded_model = cls(label_mapping, parameters) + loaded_model = cls(dirpath, label_mapping, parameters) loaded_model._model = tf_model # load self @@ -448,8 +450,7 @@ def predict(self, data, batch_size=32, show_confidences=False, :rtype: dict """ if not self._model: - raise ValueError("You are trying to predict without a model. " - "Construct/Load a model before predicting.") + self._construct_model() elif self._need_to_reconstruct_model(): raise RuntimeError("The model label mapping definitions have been " "altered without additional training. 
Please " @@ -468,7 +469,6 @@ def predict(self, data, batch_size=32, show_confidences=False, # Count number of samples in batch to prevent array mismatch num_samples_in_batch = len(batch_data) - allocation_index = batch_id * batch_size # Double array size if len(predictions) <= allocation_index: @@ -483,7 +483,7 @@ def predict(self, data, batch_size=32, show_confidences=False, allocation_index += num_samples_in_batch # Convert predictions, confidences to lists from numpy - predictions = [predictions[i].tolist() for i in range(0, allocation_index)] + predictions = [predictions[i].tolist() for i in range(allocation_index)] confidences_list = None if show_confidences: confidences = [confidences[i].tolist() @@ -498,6 +498,9 @@ def details(self): Prints the relevant details of the model (summary, parameters, label mapping) """ + if not self._model: + self._construct_model() + print("\n###### Model Details ######\n") self._model.summary() print("\nModel Parameters:") diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py new file mode 100644 index 000000000..8f06af6f2 --- /dev/null +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -0,0 +1,373 @@ +import json +import os +import unittest +from io import StringIO +from unittest import mock + +import numpy as np +import pandas as pd +import pkg_resources +import tensorflow as tf + +from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel + +_file_dir = os.path.dirname(os.path.abspath(__file__)) +_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") + + +mock_model_parameters = { + "model_path": "project/example/path/fake_model.h5", + "default_label": "UNKNOWN", +} + + +mock_label_mapping = { + "PAD": 0, + "CITY": 1, # ensure that overlapping labels get removed. + "UNKNOWN": 1, + "ADDRESS": 2, +} + + +def mock_tf_model(*args, **kwargs): + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.int64)) + model.add(tf.keras.layers.Embedding( + input_dim=100, + output_dim=30, + embeddings_initializer="normal", + trainable=True)) + model.add(tf.keras.layers.Dense(units=10, activation="relu")) + model.add(tf.keras.layers.Dense(10, activation="softmax")) + return model + + +def mock_open(filename, *args): + if filename.find("model_parameters") >= 0: + return StringIO(json.dumps(mock_model_parameters)) + elif filename.find("label_mapping") >= 0: + return StringIO(json.dumps(mock_label_mapping)) + + +def setup_save_mock_open(mock_open): + mock_file = StringIO() + mock_file.close = lambda: None + mock_open.side_effect = lambda *args: mock_file + return mock_file + + +@mock.patch('tensorflow.keras.models.load_model', side_effect=mock_tf_model) +class TestCharLoadTFModel(unittest.TestCase): + @classmethod + def setUpClass(cls): + # data + cls.df = pd.DataFrame( + { + 0: [ + "MUCH xerophytic GOOFPROOF. Ranch Declarerevise health WITH " + "zinc Rhizoctinia.INCULCATION suntrapMordacity `GUAN... " + "NECROMANTIC` HAVE mastopathy_nonfeasance_DEMOCRAT 26/09/95 " + "18:16 HE sugarcoat [8eec39e5-8acc-40ca-b424-7171ac49131b] " + "ourselves" + ], + 1: [[[164, 178, "DATETIME"], [193, 229, "UUID"]]], + } + ) + cls.label_mapping = { + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN, ensure that overlapping + "UNKNOWN": 1, # labels get removed. 
+ "ADDRESS": 2, + "BAN": 3, + "CREDIT_CARD": 4, + "EMAIL_ADDRESS": 5, + "UUID": 6, + "HASH_OR_KEY": 7, + "IPV4": 8, + "IPV6": 9, + "MAC_ADDRESS": 10, + "NAME": 11, # SAME AS PERSON + "PERSON": 11, + "PHONE_NUMBER": 12, + "SSN": 13, + "URL": 14, + "DATETIME": 15, + "INTEGER_BIG": 16, # SAME AS INTEGER + } + cls.model_path = "project/example/path/fake_model.h5" + + def test_init(self, *mocks): + + # load default + model = CharLoadTFModel(self.model_path, self.label_mapping) + expected_labels = [ + "PAD", + "UNKNOWN", + "ADDRESS", + "BAN", + "CREDIT_CARD", + "EMAIL_ADDRESS", + "UUID", + "HASH_OR_KEY", + "IPV4", + "IPV6", + "MAC_ADDRESS", + "PERSON", + "PHONE_NUMBER", + "SSN", + "URL", + "DATETIME", + "INTEGER_BIG", + ] + + self.assertDictEqual(self.label_mapping, model.label_mapping) + self.assertEqual(self.model_path, model._parameters['model_path']) + self.assertListEqual(expected_labels, model.labels) + + def test_reverse_label_mapping(self, *mocks): + + # load default + model = CharLoadTFModel(self.model_path, self.label_mapping) + + # should notice that CITY does not exist in reverse + expected_reverse_label_mapping = { + 0: "PAD", + 1: "UNKNOWN", + 2: "ADDRESS", + 3: "BAN", + 4: "CREDIT_CARD", + 5: "EMAIL_ADDRESS", + 6: "UUID", + 7: "HASH_OR_KEY", + 8: "IPV4", + 9: "IPV6", + 10: "MAC_ADDRESS", + 11: "PERSON", + 12: "PHONE_NUMBER", + 13: "SSN", + 14: "URL", + 15: "DATETIME", + 16: "INTEGER_BIG", + } + + self.assertDictEqual( + expected_reverse_label_mapping, + model.reverse_label_mapping + ) + + def test_set_label_mapping(self, *mocks): + + # load default + model = CharLoadTFModel(self.model_path, self.label_mapping) + + # test not dict + label_mapping = None + with self.assertRaisesRegex( + TypeError, + "Labels must either be a non-empty encoding dict " + "which maps labels to index encodings or a list.", + ): + model.set_label_mapping(label_mapping) + + # test label_mapping without PAD + label_mapping = { + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + } + model.set_label_mapping(label_mapping) + label_mapping["PAD"] = 0 + self.assertDictEqual(label_mapping, model.label_mapping) + + # test list without pad sets PAD: 0 + labels = [ + "UNKNOWN", + "ADDRESS", + ] + label_mapping = { + "PAD": 1, + "UNKNOWN": 2, + "ADDRESS": 3, + } + model.set_label_mapping(labels) + self.assertDictEqual(label_mapping, model.label_mapping) + + # test label_mapping with PAD: 0 + label_mapping = { + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + } + model.set_label_mapping(label_mapping) + self.assertDictEqual(label_mapping, model.label_mapping) + + # test if pad not set, but 0 taken set to last ind + # test label_mapping without PAD + label_mapping = { + "CITY": 0, + "UNKNOWN": 1, + "ADDRESS": 2, + } + model.set_label_mapping(label_mapping) + label_mapping["PAD"] = 3 + self.assertDictEqual(label_mapping, model.label_mapping) + + def test_predict(self, *mocks): + # model + model = CharLoadTFModel(self.model_path, self.label_mapping) + data_gen = [np.array([[1, 3], [1, 2]])] + result = model.predict(data_gen) + self.assertIn("pred", result) + self.assertEqual((2, 2), np.array(result['pred']).shape) + + result = model.predict(data_gen, show_confidences=True) + self.assertIn("pred", result) + self.assertIn("conf", result) + self.assertEqual( + (2, 2, model.num_labels), + np.array(result['conf']).shape + ) + + def test_fit_and_predict(self, *mocks): + # model + model = CharLoadTFModel(self.model_path, self.label_mapping) + + # data for model + data_gen = [ + [ + 
np.array([[1, 3], [1, 2]]), # x_data + np.zeros((2, 2, model.num_labels)), # y_data + ] + ] + cv_gen = data_gen + + # Basic Fit with Validation Data + with self.assertLogs( + "DataProfiler.labelers.char_load_tf_model", level="INFO" + ) as logs: + history, f1, f1_report = model.fit(data_gen, cv_gen, reset_weights=True) + + # Ensure info was logged during fit + self.assertTrue(len(logs.output)) + + data_gen = [np.array([[1, 3], [1, 2]])] + model.predict(data_gen) + + # fit with new labels + new_label_mapping = { + "PAD": 0, + "TEST": 1, + "NEW": 2, + "MAPPING": 3, + model._parameters['default_label']: 4, + } + data_gen = [ + [ + np.array([[1, 3], [1, 2]]), # x_data + np.zeros((2, 2, len(new_label_mapping))), # y_data + ] + ] + history, f1, f1_report = model.fit( + data_gen, cv_gen, label_mapping=new_label_mapping + ) + + # predict after fitting on just the text + model.predict(data_gen[0][0]) + + @mock.patch("os.makedirs", return_value=None) + def test_validation_evaluate_and_classification_report(self, *mocks): + model = CharLoadTFModel(self.model_path, self.label_mapping) + model._construct_model() # must make model to do priv validate func + + # validation data + val_gen = [ + [ + np.ones((2, 20)), # x_data + np.zeros((2, 20, model.num_labels)), # y_data + ] + ] + val_gen[0][1][0, :11, self.label_mapping["ADDRESS"]] = 1 + + f1, f1_report = model._validate_training(val_gen, 32, True, True) + self.assertIsNotNone(f1) + self.assertIsNotNone(f1_report) + self.assertEqual(11, f1_report["ADDRESS"]["support"]) + + def test_param_validation(self, *mocks): + # Make sure all parameters can be altered. Make sure non-valid params + # are caught + parameters = { + "default_label": "UNKNOWN", + } + invalid_parameters = { + "fake_extra_param": "fails", + } + model = CharLoadTFModel( + self.model_path, + label_mapping=self.label_mapping, + parameters=parameters + ) + model._construct_model() + self.assertDictEqual(parameters, model._parameters) + with self.assertRaises(ValueError): + CharLoadTFModel( + self.model_path, + label_mapping=self.label_mapping, + parameters=invalid_parameters + ) + + @mock.patch("sys.stdout", new_callable=StringIO) + def test_help(self, mock_stdout, *mocks): + CharLoadTFModel.help() + self.assertIn("CharLoadTFModel", mock_stdout.getvalue()) + self.assertIn("Parameters", mock_stdout.getvalue()) + + @mock.patch("tensorflow.keras.Model.save", return_value=None) + @mock.patch("builtins.open") + def test_save(self, mock_open, mock_tf_save, *mocks): + # setup mock + mock_file = setup_save_mock_open(mock_open) + + # Save and load a CNN Model with custom parameters + parameters = {} + label_mapping = mock_label_mapping + model = CharLoadTFModel(self.model_path, label_mapping, parameters) + + # save file and test + save_path = "./fake/path" + model.save_to_disk(save_path) + self.assertEqual( + # model parameters + '{"default_label": "UNKNOWN", "pad_label": "PAD"}' + # label_mapping + '{"PAD": 0, "CITY": 1, "UNKNOWN": 1, "ADDRESS": 2}', + mock_file.getvalue(), + ) + mock_tf_save.assert_called_with(save_path) + + # close mock + StringIO.close(mock_file) + + @mock.patch("tensorflow.keras.Model.save", return_value=None) + @mock.patch("builtins.open", side_effect=mock_open) + def test_load(self, *mocks): + dir = "fake/path/" + loaded_model = CharLoadTFModel.load_from_disk(dir) + self.assertIsInstance(loaded_model, CharLoadTFModel) + + @mock.patch("sys.stdout", new_callable=StringIO) + def test_model_details(self, mock_stdout, *mocks): + # Default Model Construct + model = 
CharLoadTFModel(self.model_path, self.label_mapping) + + # Test Details + model.details() + self.assertIn("input", mock_stdout.getvalue()) + self.assertIn("dense", mock_stdout.getvalue()) + self.assertIn("softmax_output", mock_stdout.getvalue()) + self.assertIn("Total params", mock_stdout.getvalue()) + + +if __name__ == "__main__": + unittest.main() From 4f2fd20d941a90387650227b4b05b0a463a873d7 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:08:19 -0500 Subject: [PATCH 09/14] fix: remove unneeded file --- .../labelers/pre_encoded_char_cnn_model.py | 839 ------------------ 1 file changed, 839 deletions(-) delete mode 100644 dataprofiler/labelers/pre_encoded_char_cnn_model.py diff --git a/dataprofiler/labelers/pre_encoded_char_cnn_model.py b/dataprofiler/labelers/pre_encoded_char_cnn_model.py deleted file mode 100644 index a8ecb96de..000000000 --- a/dataprofiler/labelers/pre_encoded_char_cnn_model.py +++ /dev/null @@ -1,839 +0,0 @@ -import json -import copy -import os -import sys -import time -import logging -from collections import defaultdict -import functools - -import tensorflow as tf -import numpy as np -from sklearn import decomposition - -from . import labeler_utils -from .base_model import BaseModel, BaseTrainableModel -from .base_model import AutoSubRegistrationMeta -from .. import dp_logging - -_file_dir = os.path.dirname(os.path.abspath(__file__)) - -logger = dp_logging.get_child_logger(__name__) - - -class NoV1ResourceMessageFilter(logging.Filter): - """Removes TF2 warning for using TF1 model which has resources.""" - def filter(self, record): - msg = 'is a problem, consider rebuilding the SavedModel after ' + \ - 'running tf.compat.v1.enable_resource_variables()' - return msg not in record.getMessage() - - -tf_logger = logging.getLogger('tensorflow') -tf_logger.addFilter(NoV1ResourceMessageFilter()) - - -def protected_register_keras_serializable(package='Custom', name=None): - """ - Protects against already registered keras serializable layers. This - ensures that if it was already registered, it will not try to register it - again. - """ - def decorator(arg): - """Protects against double registration of a keras layer.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - if tf.keras.utils.get_registered_object(registered_name) is None: - tf.keras.utils.register_keras_serializable(package, name)(arg) - return arg - return decorator - - -@protected_register_keras_serializable() -class FBetaScore(tf.keras.metrics.Metric): - r"""Computes F-Beta score. - Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. 
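# The save/load tests above mock TF; the real load_from_disk relies on
# tf.keras.utils.custom_object_scope so the persisted model's custom F1Score
# metric can be deserialized. A runnable round-trip of that pattern with a
# stand-in custom metric (assuming TF 2.x-era SavedModel APIs, as this PR uses):
import tempfile

import tensorflow as tf

class DemoMean(tf.keras.metrics.Mean):
    """Hypothetical custom metric standing in for labeler_utils.F1Score."""

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(2,)),
    tf.keras.layers.Dense(3, activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=[DemoMean(name="demo_mean")])

with tempfile.TemporaryDirectory() as dirpath:
    model.save(dirpath)  # SavedModel directory, like CharLoadTFModel.save_to_disk
    with tf.keras.utils.custom_object_scope({"DemoMean": DemoMean}):
        reloaded = tf.keras.models.load_model(dirpath)

assert reloaded.output_shape == model.output_shape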
- # ============================================================================== - - It is the weighted harmonic mean of precision - and recall. Output range is `[0, 1]`. Works for - both multi-class and multi-label classification. - $$ - F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` and - `weighted`. Default value is None. - beta: Determines the weight of precision and recall - in harmonic mean. Determines the weight given to the - precision and recall. Default value is 1. - threshold: Elements of `y_pred` greater than threshold are - converted to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-Beta Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, beta=1.0, threshold=None, - name="fbeta_score", dtype=None, **kwargs): - super().__init__(name=name, dtype=dtype) - - if average not in (None, "micro", "macro", "weighted"): - raise ValueError( - "Unknown average type. Acceptable values " - "are: [None, 'micro', 'macro', 'weighted']" - ) - - if not isinstance(beta, float): - raise TypeError("The value of beta should be a python float") - - if beta <= 0.0: - raise ValueError("beta value should be greater than zero") - - if threshold is not None: - if not isinstance(threshold, float): - raise TypeError("The value of threshold should be a python float") - if threshold > 1.0 or threshold <= 0.0: - raise ValueError("threshold should be between 0 and 1") - - self.num_classes = num_classes - self.average = average - self.beta = beta - self.threshold = threshold - self.axis = None - self.init_shape = [] - - if self.average != "micro": - self.axis = 0 - self.init_shape = [self.num_classes] - - def _zero_wt_init(name): - return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype - ) - - self.true_positives = _zero_wt_init("true_positives") - self.false_positives = _zero_wt_init("false_positives") - self.false_negatives = _zero_wt_init("false_negatives") - self.weights_intermediate = _zero_wt_init("weights_intermediate") - - def update_state(self, y_true, y_pred, sample_weight=None): - if self.threshold is None: - threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) - # make sure [0, 0, 0] doesn't become [1, 1, 1] - # Use abs(x) > eps, instead of x != 0 to check for zero - y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) - else: - y_pred = y_pred > self.threshold - - y_true = tf.cast(y_true, self.dtype) - y_pred = tf.cast(y_pred, self.dtype) - - def _weighted_sum(val, sample_weight): - if sample_weight is not None: - val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) - return tf.reduce_sum(val, axis=self.axis) - - self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) - self.false_positives.assign_add( - _weighted_sum(y_pred * (1 - y_true), sample_weight) - ) - self.false_negatives.assign_add( - _weighted_sum((1 - y_pred) * y_true, sample_weight) - ) - self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) - - def result(self): - precision = tf.math.divide_no_nan( - self.true_positives, 
self.true_positives + self.false_positives - ) - recall = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_negatives - ) - - mul_value = precision * recall - add_value = (tf.math.square(self.beta) * precision) + recall - mean = tf.math.divide_no_nan(mul_value, add_value) - f1_score = mean * (1 + tf.math.square(self.beta)) - - if self.average == "weighted": - weights = tf.math.divide_no_nan( - self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) - ) - f1_score = tf.reduce_sum(f1_score * weights) - - elif self.average is not None: # [micro, macro] - f1_score = tf.reduce_mean(f1_score) - - return f1_score - - def get_config(self): - """Returns the serializable config of the metric.""" - - config = { - "num_classes": self.num_classes, - "average": self.average, - "beta": self.beta, - "threshold": self.threshold, - } - - base_config = super().get_config() - return {**base_config, **config} - - def reset_states(self): - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - - -@protected_register_keras_serializable() -class F1Score(FBetaScore): - r"""Computes F-1 Score. - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the harmonic mean of precision and recall. - Output range is `[0, 1]`. Works for both multi-class - and multi-label classification. - $$ - F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` - and `weighted`. Default value is None. - threshold: Elements of `y_pred` above threshold are - considered to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-1 Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, threshold=None, - name="f1_score", dtype=None): - super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) - - def get_config(self): - base_config = super().get_config() - del base_config["beta"] - return base_config - - -class PreEncodedCharCnnModel(BaseTrainableModel, - metaclass=AutoSubRegistrationMeta): - - # boolean if the label mapping requires the mapping for index 0 reserved - requires_zero_mapping = True - - def __init__(self, label_mapping=None, parameters=None): - """ - CNN Model Initializer. 
initialize epoch_id - - :param label_mapping: maps labels to their encoded integers - :type label_mapping: dict - :param parameters: Contains all the appropriate parameters for the - model. Must contain num_labels. Other possible parameters are: - max_length, max_char_encoding_id, dim_embed, size_fc - dropout, size_conv, num_fil, optimizer, default_label - :type parameters: dict - :return: None - """ - - # parameter initialization - if not parameters: - parameters = {} - parameters.setdefault('max_length', 1014) - parameters.setdefault('alphabet_size', 69) - parameters.setdefault('dim_embed', 32) - parameters.setdefault('conv_layers', [ - [256, 7, 1], - [256, 7, 1], - [256, 3, -1], - [256, 3, -1], - [256, 3, -1], - [256, 3, 1] - ]) - parameters.setdefault('size_fc', [512, 512]) - parameters.setdefault('dropout', 0.5) - parameters.setdefault('threshold', 1e-6) - parameters.setdefault('default_label', "UNKNOWN") - parameters['pad_label'] = 'PAD' - self._epoch_id = 0 - - # reconstruct flags for model - self._model_num_labels = 0 - self._model_default_ind = -1 - - BaseModel.__init__(self, label_mapping, parameters) - - def __eq__(self, other): - """ - Checks if two models are equal with one another, may only check - important variables, i.e. may not check model itself. - - :param self: a model - :param other: a model - :type self: BaseModel - :type other: BaseModel - :return: Whether or not self and other are equal - :rtype: bool - """ - if self._parameters != other._parameters \ - or self._label_mapping != other._label_mapping: - return False - return True - - def _validate_parameters(self, parameters): - """ - Validate the parameters sent in. Raise error if invalid parameters are - present. - - :param parameters: parameter dict containing the following parameters: - max_length: Maximum char length in a sample - max_char_encoding_id: Maximum integer value for encoding the input - dim_embed: Number of embedded dimensions - size_fc: Size of each fully connected layers - dropout: Ratio of dropout in the model - size_conv: Convolution kernel size - default_label: Key for label_mapping that is the default label - pad_label: Key for entities_dict that is the pad label - num_fil: Number of filters in each convolution layer - :type parameters: dict - :return: None - """ - errors = [] - list_of_necessary_params = ['max_length', 'alphabet_size', - 'dim_embed', 'size_fc', 'dropout', - 'threshold', 'conv_layers', 'default_label', - 'pad_label'] - # Make sure the necessary parameters are present and valid. 
- for param in parameters: - if param in ['max_length', 'alphabet_size', 'dim_embed']: - if not isinstance(parameters[param], (int, float)) \ - or parameters[param] < 0: - errors.append(param + " must be a valid integer or float " - "greater than 0.") - elif param in ['dropout', 'threshold']: - if not isinstance(parameters[param], (int, float)) \ - or parameters[param] < 0 or parameters[param] > 1: - errors.append(param + " must be a valid integer or float " - "from 0 to 1.") - elif param == 'size_fc': - if not isinstance(parameters[param], list) \ - or len(parameters[param]) == 0: - errors.append(param + " must be a non-empty list of " - "integers.") - else: - for item in parameters[param]: - if not isinstance(item, int): - errors.append(param + " must be a non-empty " - "list of integers.") - break - elif param == 'conv_layers': - is_bad_conv_layers = True - if isinstance(parameters[param], list): - is_bad_conv_layers = False - for layer in parameters[param]: - if (not isinstance(layer, list) or len(layer) != 3 - or any([not isinstance(x, int) for x in layer])): - is_bad_conv_layers = True - if is_bad_conv_layers: - errors.append(param + " must be a non-empty list of " - "tuples containing 3 integers.") - elif param == 'default_label': - if not isinstance(parameters[param], str): - error = str(param) + " must be a string." - errors.append(error) - - # Error if there are extra parameters thrown in - for param in parameters: - if param not in list_of_necessary_params: - errors.append(param + " is not an accepted parameter.") - if errors: - raise ValueError('\n'.join(errors)) - - def set_label_mapping(self, label_mapping): - """ - Sets the labels for the model - - :param label_mapping: label mapping of the model - :type label_mapping: dict - :return: None - """ - if not isinstance(label_mapping, (list, dict)): - raise TypeError("Labels must either be a non-empty encoding dict " - "which maps labels to index encodings or a list.") - - label_mapping = copy.deepcopy(label_mapping) - if 'PAD' not in label_mapping: - if isinstance(label_mapping, list): # if list missing PAD - label_mapping = ['PAD'] + label_mapping - elif 0 not in label_mapping.values(): # if dict missing PAD and 0 - label_mapping.update({'PAD': 0}) - if (isinstance(label_mapping, dict) - and label_mapping.get('PAD', None) != 0): # dict with bad PAD - raise ValueError("`PAD` must map to index zero.") - if self._parameters['default_label'] not in label_mapping: - raise ValueError("The `default_label` of {} must exist in the " - "label mapping.".format( - self._parameters['default_label'])) - super().set_label_mapping(label_mapping) - - def _need_to_reconstruct_model(self): - """ - Determines whether or not the model needs to be reconstructed. - - :return: bool of whether or not the model needs to reconstruct. 
- """ - if not self._model: - return False - default_ind = self.label_mapping[self._parameters['default_label']] - return self.num_labels != self._model_num_labels or \ - default_ind != self._model_default_ind - - def save_to_disk(self, dirpath): - """ - Saves whole model to disk with weights - - :param dirpath: directory path where you want to save the model to - :type dirpath: str - :return: None - """ - if not self._model: - self._construct_model() - elif self._need_to_reconstruct_model(): - self._reconstruct_model() - - model_param_dirpath = os.path.join(dirpath, "model_parameters.json") - with open(model_param_dirpath, 'w') as fp: - json.dump(self._parameters, fp) - labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'w') as fp: - json.dump(self.label_mapping, fp) - self._model.save(os.path.join(dirpath)) - - @classmethod - def load_from_disk(cls, dirpath): - """ - Loads whole model from disk with weights - - :param dirpath: directory path where you want to load the model from - :type dirpath: str - :return: None - """ - - # load parameters - model_param_dirpath = os.path.join(dirpath, "model_parameters.json") - with open(model_param_dirpath, 'r') as fp: - parameters = json.load(fp) - - # load label_mapping - labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'r') as fp: - label_mapping = json.load(fp) - - # use f1 score metric - custom_objects = { - "F1Score": F1Score( - num_classes=max(label_mapping.values()) + 1, - average='micro'), - "CharacterLevelCnnModel": cls, - } - with tf.keras.utils.custom_object_scope(custom_objects): - tf_model = tf.keras.models.load_model(dirpath) - - loaded_model = cls(label_mapping, parameters) - loaded_model._model = tf_model - # - # # Tensorflow v1 Model weights need to be transferred. - # if not callable(tf_model): - # loaded_model._construct_model() - # tf1_weights = [] - # for var in tf_model.variables: - # if 'training' not in var.name: - # tf1_weights.append(var.value()) - # - # loaded_model._construct_model() - # tf1_weights.append(loaded_model._model.weights[-1].value()) - # loaded_model._model.set_weights(tf1_weights) - - # load self - loaded_model._model_num_labels = loaded_model.num_labels - loaded_model._model_default_ind = loaded_model.label_mapping[ - loaded_model._parameters['default_label'] - ] - return loaded_model - - def _construct_model(self): - """ - Model constructor for the data labeler. This also serves as a weight - reset. 
- - :return: None - """ - num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] - - # default parameters - max_length = self._parameters['max_length'] - alphabet_size = self._parameters['alphabet_size'] - dim_embed = self._parameters['dim_embed'] - conv_layers = self._parameters['conv_layers'] - size_fc = self._parameters['size_fc'] - threshold = self._parameters['threshold'] - dropout = self._parameters['dropout'] - - # Reset model - tf.keras.backend.clear_session() - - # Input layer - inputs = tf.keras.layers.Input( - shape=(None,), name='sent_input', dtype='int64') - # Embedding layers - x_embedding = tf.keras.layers.Embedding( - alphabet_size + 1, dim_embed, input_length=max_length)(inputs) - - # Convolution layers - x = x_embedding - for cl in conv_layers: - x = tf.keras.layers.Convolution1D(cl[0], cl[1], padding='same')(x) - x = tf.keras.layers.ThresholdedReLU(threshold)(x) - if cl[2] != -1: - x = tf.keras.layers.MaxPooling1D(cl[2])(x) - # x = tf.keras.layers.Flatten()(x) - - # Fully connected layers - for fl in size_fc: - x_dense = tf.keras.layers.Dense(fl)(x) - x = tf.keras.layers.ThresholdedReLU(threshold)(x_dense) - x = tf.keras.layers.Dropout(dropout)(x) - - # Output layer - predictions = tf.keras.layers.Dense( - num_labels, activation='softmax', name='softmax_output')(x) - # argmax layer - argmax_layer = tf.keras.backend.argmax(predictions) - - # Build and compile model - self._model = tf.keras.models.Model( - inputs=inputs, outputs=[predictions, argmax_layer]) - - # Compile the model w/ metrics - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - - # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) - - self._epoch_id = 0 - self._model_num_labels = num_labels - self._model_default_ind = default_ind - - def reset_weights(self): - """ - Reset the weights of the model. 
- - :return: None - """ - self._construct_model() - - def _reconstruct_model(self): - """ - Reconstruct the appropriate layers if the number of number of labels is - altered - - :return: None - """ - - # Reset model - tf.keras.backend.clear_session() - - num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] - - # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') - for _ in range(2): - self._model.layers.pop() - - # Add the final Softmax layer to the previous spot - final_softmax_layer = tf.keras.layers.Dense( - num_labels, activation='softmax', name="softmax_output")( - self._model.layers[-4].output) - - # Output the model into a .pb file for TensorFlow - argmax_layer = tf.keras.backend.argmax(final_softmax_layer) - - - argmax_outputs = [final_softmax_layer, argmax_layer] - self._model = tf.keras.Model(self._model.inputs, argmax_outputs) - - # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - - # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) - - self._epoch_id = 0 - self._model_num_labels = num_labels - self._model_default_ind = default_ind - - def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, - reset_weights=False, verbose=True): - """ - Train the current model with the training data and validation data - - :param train_data: Training data used to train model - :type train_data: Union[list, np.ndarray] - :param val_data: Validation data used to validate the training - :type val_data: Union[list, np.ndarray] - :param batch_size: Used to determine number of samples in each batch - :type batch_size: int - :param label_mapping: maps labels to their encoded integers - :type label_mapping: Union[dict, None] - :param reset_weights: Flag to determine whether to reset the weights or - not - :type reset_weights: bool - :param verbose: Flag to determine whether to print status or not - :type verbose: bool - :return: None - """ - - if label_mapping is not None: - self.set_label_mapping(label_mapping) - - if not self._model: - self._construct_model() - else: - if self._need_to_reconstruct_model(): - self._reconstruct_model() - if reset_weights: - self.reset_weights() - - history = defaultdict() - f1 = None - f1_report = [] - - self._model.reset_metrics() - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - - start_time = time.time() - batch_id = 0 - for x_train, y_train in train_data: - model_results = self._model.train_on_batch( - x_train, {softmax_output_layer_name: y_train}) - sys.stdout.flush() - if verbose: - sys.stdout.write( - "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " - "f1_score %f" % - (self._epoch_id, batch_id, *model_results[1:])) - batch_id += 1 - - for i, metric_label in enumerate(self._model.metrics_names): - history[metric_label] = model_results[i] - - if val_data: - f1, f1_report = self._validate_training(val_data) - history['f1_report'] = f1_report - - val_f1 = f1_report['weighted avg']['f1-score'] \ - if f1_report else np.NAN - val_precision = f1_report['weighted avg']['precision'] \ - if f1_report else np.NAN - val_recall = f1_report['weighted avg']['recall'] \ - if f1_report else np.NAN - epoch_time = time.time() - start_time - logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - 
f1_score %f -- " - "val_f1: %f - val_precision: %f - val_recall %f" % - (self._epoch_id, epoch_time, *model_results[1:], - val_f1, val_precision, val_recall)) - - self._epoch_id += 1 - - return history, f1, f1_report - - def _validate_training(self, val_data, batch_size_test=32, - verbose_log=True, verbose_keras=False): - """ - Validate the model on the test set and return the evaluation metrics. - - :param val_data: data generator for the validation - :type val_data: iterator - :param batch_size_test: Number of samples to process in testing - :type batch_size_test: int - :param verbose_log: whether or not to print out scores for training, - etc. - :type verbose_log: bool - :param verbose_keras: whether or not to print out scores for training, - from keras. - :type verbose_keras: bool - return (f1-score, f1 report). - """ - f1 = None - f1_report = None - - if val_data is None: - return f1, f1_report - - # Predict on the test set - batch_id = 0 - y_val_pred = [] - y_val_test = [] - for x_val, y_val in val_data: - y_val_pred.append(self._model.predict( - x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) - y_val_test.append(np.argmax(y_val, axis=-1)) - batch_id += 1 - sys.stdout.flush() - if verbose_log: - sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % - (self._epoch_id, batch_id)) - - tf.keras.backend.set_floatx('float32') - # Clean the predicted entities and the actual entities - f1, f1_report = labeler_utils.evaluate_accuracy( - np.concatenate(y_val_pred, axis=0), - np.concatenate(y_val_test, axis=0), - self.num_labels, - self.reverse_label_mapping, - verbose=verbose_keras) - - return f1, f1_report - - def predict(self, data, batch_size=32, show_confidences=False, - verbose=True): - """ - Run model and get predictions - - :param data: text input - :type data: Union[list, numpy.ndarray] - :param batch_size: number of samples in the batch of data - :type batch_size: int - :param show_confidences: whether user wants prediction confidences - :type show_confidences: - :param verbose: Flag to determine whether to print status or not - :type verbose: bool - :return: char level predictions and confidences - :rtype: dict - """ - if not self._model: - raise ValueError("You are trying to predict without a model. " - "Construct/Load a model before predicting.") - elif self._need_to_reconstruct_model(): - raise RuntimeError("The model label mapping definitions have been " - "altered without additional training. 
Please " - "train the model or reset the label mapping to " - "predict.") - # Pre-allocate space for predictions - confidences = [] - sentence_lengths = np.zeros((batch_size,), dtype=int) - predictions = np.zeros((batch_size, self._parameters['max_length'])) - if show_confidences: - confidences = np.zeros((batch_size, - self._parameters['max_length'], - self.num_labels)) - - # Run model with batching - allocation_index = 0 - for batch_id, batch_data in enumerate(data): - model_output = self._model( - tf.convert_to_tensor(batch_data) - ) - - # Count number of samples in batch to prevent array mismatch - num_samples_in_batch = len(batch_data) - allocation_index = batch_id * batch_size - - # Double array size - if len(predictions) <= allocation_index: - predictions = np.pad(predictions, ((0, len(predictions)), - (0, 0)), mode='constant') - sentence_lengths = np.pad( - sentence_lengths, pad_width=((0, len(sentence_lengths)),), - mode='constant') - if show_confidences: - confidences = np.pad(confidences, - ((0, len(predictions)), - (0, 0), (0, 0)), mode='constant') - - if show_confidences: - confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() - predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() - sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) - - allocation_index += num_samples_in_batch - - # Convert predictions, confidences to lists from numpy - predictions_list = [i for i in range(0, allocation_index)] - confidences_list = None - if show_confidences: - confidences_list = [i for i in range(0, allocation_index)] - - # Append slices of predictions to return prediction & confidence matrices - for index, sentence_length \ - in enumerate(sentence_lengths[:allocation_index]): - predictions_list[index] = list(predictions[index][:sentence_length]) - if show_confidences: - confidences_list[index] = list(confidences[index][:sentence_length]) - - if show_confidences: - return {'pred': predictions_list, 'conf': confidences_list} - return {'pred': predictions_list} - - def details(self): - """ - Prints the relevant details of the model (summary, parameters, label - mapping) - """ - print("\n###### Model Details ######\n") - self._model.summary() - print("\nModel Parameters:") - for key, value in self._parameters.items(): - print("{}: {}".format(key, value)) - print("\nModel Label Mapping:") - for key, value in self.label_mapping.items(): - print("{}: {}".format(key, value)) From 9a17b74a531563c903d6d5f74885d70321fd278e Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:16:24 -0500 Subject: [PATCH 10/14] fix: f1score imports --- dataprofiler/tests/labelers/test_f_scores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/tests/labelers/test_f_scores.py b/dataprofiler/tests/labelers/test_f_scores.py index 241d3db50..e707dc1ea 100644 --- a/dataprofiler/tests/labelers/test_f_scores.py +++ b/dataprofiler/tests/labelers/test_f_scores.py @@ -22,7 +22,7 @@ import numpy as np import tensorflow as tf -from dataprofiler.labelers.character_level_cnn_model import F1Score, FBetaScore +from dataprofiler.labelers.labeler_utils import F1Score, FBetaScore class TestFScore(unittest.TestCase): From 0b7a44ff0385bea510695d4420ea1e1ea456b367 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:32:32 -0500 Subject: [PATCH 11/14] fix: logger count --- dataprofiler/tests/labelers/test_labeler_utils.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index b352ebd9e..58c32a7a3 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -294,8 +294,8 @@ def test_get_tf_layer_index_from_name(self): def test_hide_tf_logger_warnings(self): logger = logging.getLogger('tensorflow') - self.assertListEqual([], logger.filters) + num_loggers = len(logger.filters) # make change and validate updated filter labeler_utils.hide_tf_logger_warnings() - self.assertEqual(1, len(logger.filters)) + self.assertEqual(1 + num_loggers, len(logger.filters)) From ade62e40244aa04cd27eb7f37509f8d17b531f10 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 13 Jul 2022 09:36:28 -0500 Subject: [PATCH 12/14] fix: reformat with isort / black --- dataprofiler/labelers/char_load_tf_model.py | 220 +++++++++++--------- 1 file changed, 123 insertions(+), 97 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 5abc1fa96..01e2f6bb8 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -1,17 +1,16 @@ -import json import copy +import json import os import sys import time from collections import defaultdict -import tensorflow as tf import numpy as np +import tensorflow as tf -from . import labeler_utils -from .base_model import BaseModel, BaseTrainableModel -from .base_model import AutoSubRegistrationMeta from .. import dp_logging +from . import labeler_utils +from .base_model import AutoSubRegistrationMeta, BaseModel, BaseTrainableModel _file_dir = os.path.dirname(os.path.abspath(__file__)) @@ -19,14 +18,12 @@ labeler_utils.hide_tf_logger_warnings() -class CharLoadTFModel(BaseTrainableModel, - metaclass=AutoSubRegistrationMeta): +class CharLoadTFModel(BaseTrainableModel, metaclass=AutoSubRegistrationMeta): # boolean if the label mapping requires the mapping for index 0 reserved requires_zero_mapping = False - def __init__(self, model_path, label_mapping=None, - parameters=None): + def __init__(self, model_path, label_mapping=None, parameters=None): """ Loadable TF Model Initializer. @@ -45,9 +42,9 @@ def __init__(self, model_path, label_mapping=None, # parameter initialization if not parameters: parameters = {} - parameters.setdefault('default_label', "UNKNOWN") - parameters['model_path'] = model_path - parameters['pad_label'] = 'PAD' + parameters.setdefault("default_label", "UNKNOWN") + parameters["model_path"] = model_path + parameters["pad_label"] = "PAD" self._epoch_id = 0 # reconstruct flags for model @@ -68,8 +65,10 @@ def __eq__(self, other): :return: Whether or not self and other are equal :rtype: bool """ - if self._parameters != other._parameters \ - or self._label_mapping != other._label_mapping: + if ( + self._parameters != other._parameters + or self._label_mapping != other._label_mapping + ): return False return True @@ -92,12 +91,11 @@ def _validate_parameters(self, parameters): :return: None """ errors = [] - list_of_necessary_params = ['model_path', 'default_label', - 'pad_label'] + list_of_necessary_params = ["model_path", "default_label", "pad_label"] # Make sure the necessary parameters are present and valid. 
for param in parameters: - if param in ['default_label', 'model_path', 'pad_label']: + if param in ["default_label", "model_path", "pad_label"]: if not isinstance(parameters[param], str): error = str(param) + " must be a string." errors.append(error) @@ -107,7 +105,7 @@ def _validate_parameters(self, parameters): if param not in list_of_necessary_params: errors.append(param + " is not an accepted parameter.") if errors: - raise ValueError('\n'.join(errors)) + raise ValueError("\n".join(errors)) def set_label_mapping(self, label_mapping): """ @@ -118,22 +116,24 @@ def set_label_mapping(self, label_mapping): :return: None """ if not isinstance(label_mapping, (list, dict)): - raise TypeError("Labels must either be a non-empty encoding dict " - "which maps labels to index encodings or a list.") + raise TypeError( + "Labels must either be a non-empty encoding dict " + "which maps labels to index encodings or a list." + ) label_mapping = copy.deepcopy(label_mapping) - if 'PAD' not in label_mapping: + if "PAD" not in label_mapping: if isinstance(label_mapping, list): # if list missing PAD - label_mapping = ['PAD'] + label_mapping + label_mapping = ["PAD"] + label_mapping elif 0 not in label_mapping.values(): # if dict missing PAD and 0 - label_mapping.update({'PAD': 0}) + label_mapping.update({"PAD": 0}) else: - label_mapping.update( - {'PAD': max(list(label_mapping.values())) + 1}) - if self._parameters['default_label'] not in label_mapping: - raise ValueError("The `default_label` of {} must exist in the " - "label mapping.".format( - self._parameters['default_label'])) + label_mapping.update({"PAD": max(list(label_mapping.values())) + 1}) + if self._parameters["default_label"] not in label_mapping: + raise ValueError( + "The `default_label` of {} must exist in the " + "label mapping.".format(self._parameters["default_label"]) + ) super().set_label_mapping(label_mapping) def _need_to_reconstruct_model(self): @@ -144,9 +144,11 @@ def _need_to_reconstruct_model(self): """ if not self._model: return False - default_ind = self.label_mapping[self._parameters['default_label']] - return self.num_labels != self._model_num_labels or \ - default_ind != self._model_default_ind + default_ind = self.label_mapping[self._parameters["default_label"]] + return ( + self.num_labels != self._model_num_labels + or default_ind != self._model_default_ind + ) def save_to_disk(self, dirpath): """ @@ -164,10 +166,10 @@ def save_to_disk(self, dirpath): model_param_dirpath = os.path.join(dirpath, "model_parameters.json") model_parameters = self._parameters.copy() model_parameters.pop("model_path") - with open(model_param_dirpath, 'w') as fp: + with open(model_param_dirpath, "w") as fp: json.dump(model_parameters, fp) labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'w') as fp: + with open(labels_dirpath, "w") as fp: json.dump(self.label_mapping, fp) self._model.save(dirpath) @@ -183,19 +185,19 @@ def load_from_disk(cls, dirpath): # load parameters model_param_dirpath = os.path.join(dirpath, "model_parameters.json") - with open(model_param_dirpath, 'r') as fp: + with open(model_param_dirpath, "r") as fp: parameters = json.load(fp) # load label_mapping labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'r') as fp: + with open(labels_dirpath, "r") as fp: label_mapping = json.load(fp) # use f1 score metric custom_objects = { "F1Score": labeler_utils.F1Score( - num_classes=max(label_mapping.values()) + 1, - average='micro'), + 
num_classes=max(label_mapping.values()) + 1, average="micro" + ), "CharLoadTFModel": cls, } with tf.keras.utils.custom_object_scope(custom_objects): @@ -207,7 +209,7 @@ def load_from_disk(cls, dirpath): # load self loaded_model._model_num_labels = loaded_model.num_labels loaded_model._model_default_ind = loaded_model.label_mapping[ - loaded_model._parameters['default_label'] + loaded_model._parameters["default_label"] ] return loaded_model @@ -219,37 +221,38 @@ def _construct_model(self): :return: None """ num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] - model_loc = self._parameters['model_path'] + default_ind = self.label_mapping[self._parameters["default_label"]] + model_loc = self._parameters["model_path"] self._model = tf.keras.models.load_model(model_loc) - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] softmax_layer_ind = labeler_utils.get_tf_layer_index_from_name( - self._model, softmax_output_layer_name) + self._model, softmax_output_layer_name + ) softmax_layer = self._model.get_layer(softmax_output_layer_name) prev_softmax_layer = softmax_layer.input new_softmax_layer = softmax_layer.output if softmax_layer.weights[0].shape[-1] != num_labels: new_softmax_layer = tf.keras.layers.Dense( - num_labels, activation='softmax', name="softmax_output")( - self._model.layers[softmax_layer_ind - 1].output) + num_labels, activation="softmax", name="softmax_output" + )(self._model.layers[softmax_layer_ind - 1].output) # Output the model into a .pb file for TensorFlow argmax_layer = tf.keras.backend.argmax(new_softmax_layer) - argmax_outputs = [new_softmax_layer, argmax_layer] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model w/ metrics - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + num_classes=num_labels, average="micro" + ) + metrics = {softmax_output_layer_name: ["acc", f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -277,7 +280,7 @@ def _reconstruct_model(self): tf.keras.backend.clear_session() num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] + default_ind = self.label_mapping[self._parameters["default_label"]] # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') for _ in range(2): @@ -285,24 +288,24 @@ def _reconstruct_model(self): # Add the final Softmax layer to the previous spot final_softmax_layer = tf.keras.layers.Dense( - num_labels, activation='softmax', name="softmax_output")( - self._model.layers[-4].output) + num_labels, activation="softmax", name="softmax_output" + )(self._model.layers[-4].output) # Output the model into a .pb file for TensorFlow argmax_layer = tf.keras.backend.argmax(final_softmax_layer) - argmax_outputs = [final_softmax_layer, argmax_layer] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score 
metric f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + num_classes=num_labels, average="micro" + ) + metrics = {softmax_output_layer_name: ["acc", f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -310,8 +313,15 @@ def _reconstruct_model(self): self._model_num_labels = num_labels self._model_default_ind = default_ind - def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, - reset_weights=False, verbose=True): + def fit( + self, + train_data, + val_data=None, + batch_size=32, + label_mapping=None, + reset_weights=False, + verbose=True, + ): """ Train the current model with the training data and validation data @@ -347,19 +357,20 @@ def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, f1_report = [] self._model.reset_metrics() - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] start_time = time.time() batch_id = 0 for x_train, y_train in train_data: model_results = self._model.train_on_batch( - x_train, {softmax_output_layer_name: y_train}) + x_train, {softmax_output_layer_name: y_train} + ) sys.stdout.flush() if verbose: sys.stdout.write( "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " - "f1_score %f" % - (self._epoch_id, batch_id, *model_results[1:])) + "f1_score %f" % (self._epoch_id, batch_id, *model_results[1:]) + ) batch_id += 1 for i, metric_label in enumerate(self._model.metrics_names): @@ -367,26 +378,34 @@ def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, if val_data: f1, f1_report = self._validate_training(val_data) - history['f1_report'] = f1_report - - val_f1 = f1_report['weighted avg']['f1-score'] \ - if f1_report else np.NAN - val_precision = f1_report['weighted avg']['precision'] \ - if f1_report else np.NAN - val_recall = f1_report['weighted avg']['recall'] \ - if f1_report else np.NAN + history["f1_report"] = f1_report + + val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN + val_precision = ( + f1_report["weighted avg"]["precision"] if f1_report else np.NAN + ) + val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN epoch_time = time.time() - start_time - logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " - "val_f1: %f - val_precision: %f - val_recall %f" % - (self._epoch_id, epoch_time, *model_results[1:], - val_f1, val_precision, val_recall)) + logger.info( + "\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " + "val_f1: %f - val_precision: %f - val_recall %f" + % ( + self._epoch_id, + epoch_time, + *model_results[1:], + val_f1, + val_precision, + val_recall, + ) + ) self._epoch_id += 1 return history, f1, f1_report - def _validate_training(self, val_data, batch_size_test=32, - verbose_log=True, verbose_keras=False): + def _validate_training( + self, val_data, batch_size_test=32, verbose_log=True, verbose_keras=False + ): """ Validate the model on the test set and return the evaluation metrics. 
@@ -413,28 +432,32 @@ def _validate_training(self, val_data, batch_size_test=32, y_val_pred = [] y_val_test = [] for x_val, y_val in val_data: - y_val_pred.append(self._model.predict( - x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) + y_val_pred.append( + self._model.predict( + x_val, batch_size=batch_size_test, verbose=verbose_keras + )[1] + ) y_val_test.append(np.argmax(y_val, axis=-1)) batch_id += 1 sys.stdout.flush() if verbose_log: - sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % - (self._epoch_id, batch_id)) + sys.stdout.write( + "\rEPOCH %g, validation_batch_id %d" % (self._epoch_id, batch_id) + ) - tf.keras.backend.set_floatx('float32') + tf.keras.backend.set_floatx("float32") # Clean the predicted entities and the actual entities f1, f1_report = labeler_utils.evaluate_accuracy( np.concatenate(y_val_pred, axis=0), np.concatenate(y_val_test, axis=0), self.num_labels, self.reverse_label_mapping, - verbose=verbose_keras) + verbose=verbose_keras, + ) return f1, f1_report - def predict(self, data, batch_size=32, show_confidences=False, - verbose=True): + def predict(self, data, batch_size=32, show_confidences=False, verbose=True): """ Run model and get predictions @@ -452,10 +475,12 @@ def predict(self, data, batch_size=32, show_confidences=False, if not self._model: self._construct_model() elif self._need_to_reconstruct_model(): - raise RuntimeError("The model label mapping definitions have been " - "altered without additional training. Please " - "train the model or reset the label mapping to " - "predict.") + raise RuntimeError( + "The model label mapping definitions have been " + "altered without additional training. Please " + "train the model or reset the label mapping to " + "predict." + ) # Pre-allocate space for predictions confidences = [] predictions = [] @@ -463,9 +488,7 @@ def predict(self, data, batch_size=32, show_confidences=False, # Run model with batching allocation_index = 0 for batch_id, batch_data in enumerate(data): - model_output = self._model( - tf.convert_to_tensor(batch_data) - ) + model_output = self._model(tf.convert_to_tensor(batch_data)) # Count number of samples in batch to prevent array mismatch num_samples_in_batch = len(batch_data) @@ -477,8 +500,12 @@ def predict(self, data, batch_size=32, show_confidences=False, confidences += confidences if show_confidences: - confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() - predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() + confidences[ + allocation_index : allocation_index + num_samples_in_batch + ] = model_output[0].numpy() + predictions[ + allocation_index : allocation_index + num_samples_in_batch + ] = model_output[1].numpy() allocation_index += num_samples_in_batch @@ -486,12 +513,11 @@ def predict(self, data, batch_size=32, show_confidences=False, predictions = [predictions[i].tolist() for i in range(allocation_index)] confidences_list = None if show_confidences: - confidences = [confidences[i].tolist() - for i in range(0, allocation_index)] + confidences = [confidences[i].tolist() for i in range(0, allocation_index)] if show_confidences: - return {'pred': predictions, 'conf': confidences} - return {'pred': predictions} + return {"pred": predictions, "conf": confidences} + return {"pred": predictions} def details(self): """ From 032ad8e192f8b704cc1835af4fee1c04acedf4e9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 13 Jul 2022 09:37:33 -0500 Subject: [PATCH 13/14] fix: reformat 
tests for isort / black --- .../test_char_load_tf_data_labeler.py | 121 +++++++++--------- .../tests/labelers/test_char_tf_load_model.py | 35 +++-- 2 files changed, 73 insertions(+), 83 deletions(-) diff --git a/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py index 2f8237db5..a5bcaf6da 100644 --- a/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py +++ b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py @@ -4,56 +4,45 @@ from io import StringIO from unittest import mock -from dataprofiler.labelers import DataLabeler, UnstructuredDataLabeler, \ - data_processing -from dataprofiler.labelers.char_load_tf_model import \ - CharLoadTFModel - +from dataprofiler.labelers import DataLabeler, UnstructuredDataLabeler, data_processing +from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) data_labeler_parameters = { - 'model': { - 'class': 'CharLoadTFModel', - 'parameters': {} - }, - 'label_mapping': { - 'PAD': 0, - 'CITY': 1, # SAME AS UNKNOWN - 'UNKNOWN': 1, - 'ADDRESS': 2, - 'PERSON': 3, - }, - 'preprocessor': { - 'class': 'CharEncodedPreprocessor' - }, - 'postprocessor': { - 'class': 'CharPostprocessor' + "model": {"class": "CharLoadTFModel", "parameters": {}}, + "label_mapping": { + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + "PERSON": 3, }, + "preprocessor": {"class": "CharEncodedPreprocessor"}, + "postprocessor": {"class": "CharPostprocessor"}, } preprocessor_parameters = { - 'encoding_map': {'t': 1, 's': 2}, - 'flatten_split': 0, - 'flatten_separator': ' ', - 'is_separate_at_max_len': True, - + "encoding_map": {"t": 1, "s": 2}, + "flatten_split": 0, + "flatten_separator": " ", + "is_separate_at_max_len": True, } postprocessor_parameters = { - 'use_word_level_argmax': True, - 'output_format': 'character_argmax', - 'separators': (' ', ',', ';', "'", '"', ':', '\n', '\t', "."), - 'word_level_min_percent': 0.75, + "use_word_level_argmax": True, + "output_format": "character_argmax", + "separators": (" ", ",", ";", "'", '"', ":", "\n", "\t", "."), + "word_level_min_percent": 0.75, } def mock_open(filename, *args): - if filename.find('data_labeler_parameters') >= 0: + if filename.find("data_labeler_parameters") >= 0: return StringIO(json.dumps(data_labeler_parameters)) - elif filename.find('preprocessor_parameters') >= 0: + elif filename.find("preprocessor_parameters") >= 0: return StringIO(json.dumps(preprocessor_parameters)) - elif filename.find('postprocessor_parameters') >= 0: + elif filename.find("postprocessor_parameters") >= 0: return StringIO(json.dumps(postprocessor_parameters)) @@ -64,68 +53,71 @@ def setup_save_mock_open(mock_open): return mock_file -@mock.patch('dataprofiler.labelers.data_processing.BaseDataProcessor') -@mock.patch('dataprofiler.labelers.char_load_tf_model.' - 'CharLoadTFModel.load_from_disk') +@mock.patch("dataprofiler.labelers.data_processing.BaseDataProcessor") +@mock.patch( + "dataprofiler.labelers.char_load_tf_model." 
"CharLoadTFModel.load_from_disk" +) @mock.patch("builtins.open", side_effect=mock_open) class TestCharTFLoadDataLabeler(unittest.TestCase): - @staticmethod def _setup_mock_load_model(mock_load_model): model_mock = mock.Mock(spec=CharLoadTFModel) model_mock.set_num_labels = mock.Mock() mock_load_model.return_value = model_mock model_mock.requires_zero_mapping = True - model_mock.labels = ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON'] + model_mock.labels = ["PAD", "UNKNOWN", "ADDRESS", "PERSON"] model_mock.label_mapping = { - 'PAD': 0, - 'CITY': 1, # SAME AS UNKNOWN - 'UNKNOWN': 1, - 'ADDRESS': 2, - 'PERSON': 3, + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + "PERSON": 3, } model_mock.reverse_label_mapping = { - 0: 'PAD', - 1: 'UNKNOWN', - 2: 'ADDRESS', - 3: 'PERSON', + 0: "PAD", + 1: "UNKNOWN", + 2: "ADDRESS", + 3: "PERSON", } @staticmethod def _setup_mock_load_processor(mock_base_processor): def side_effect(arg): processor = { - 'CharEncodedPreprocessor': mock.Mock( - spec=data_processing.CharEncodedPreprocessor), - 'CharPostprocessor': mock.Mock( - spec=data_processing.CharPostprocessor), + "CharEncodedPreprocessor": mock.Mock( + spec=data_processing.CharEncodedPreprocessor + ), + "CharPostprocessor": mock.Mock(spec=data_processing.CharPostprocessor), }[arg] processor.load_from_disk.return_value = processor return processor mock_base_processor.get_class.side_effect = side_effect - def test_load_from_disk(self, mock_open, mock_load_model, - mock_base_processor): + def test_load_from_disk(self, mock_open, mock_load_model, mock_base_processor): self._setup_mock_load_model(mock_load_model) self._setup_mock_load_processor(mock_base_processor) # load default - data_labeler = DataLabeler.load_from_disk('fake/path') + data_labeler = DataLabeler.load_from_disk("fake/path") - self.assertDictEqual(data_labeler.label_mapping, - data_labeler_parameters['label_mapping']) + self.assertDictEqual( + data_labeler.label_mapping, data_labeler_parameters["label_mapping"] + ) self.assertListEqual( - data_labeler.labels, - ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON']) + data_labeler.labels, ["PAD", "UNKNOWN", "ADDRESS", "PERSON"] + ) self.assertIsInstance( - data_labeler.preprocessor, data_processing.BaseDataPreprocessor) + data_labeler.preprocessor, data_processing.BaseDataPreprocessor + ) self.assertIsInstance( - data_labeler.postprocessor, data_processing.BaseDataPostprocessor) + data_labeler.postprocessor, data_processing.BaseDataPostprocessor + ) - def test_save_to_disk(self, mock_open, mock_load_model, - mock_load_processor, *mocks): + def test_save_to_disk( + self, mock_open, mock_load_model, mock_load_processor, *mocks + ): self._setup_mock_load_model(mock_load_model) self._setup_mock_load_processor(mock_load_processor) @@ -137,12 +129,13 @@ def test_save_to_disk(self, mock_open, mock_load_model, mock_file = setup_save_mock_open(mock_open) # save and test - data_labeler.save_to_disk('test/path') + data_labeler.save_to_disk("test/path") self.assertEqual( '{"model": {"class": "CharLoadTFModel"}, ' '"preprocessor": {"class": "CharEncodedPreprocessor"}, ' '"postprocessor": {"class": "CharPostprocessor"}}', - mock_file.getvalue()) + mock_file.getvalue(), + ) # close mock StringIO.close(mock_file) diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index 8f06af6f2..fbfde0c49 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -32,11 +32,14 @@ 
def mock_tf_model(*args, **kwargs): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.int64)) - model.add(tf.keras.layers.Embedding( - input_dim=100, - output_dim=30, - embeddings_initializer="normal", - trainable=True)) + model.add( + tf.keras.layers.Embedding( + input_dim=100, + output_dim=30, + embeddings_initializer="normal", + trainable=True, + ) + ) model.add(tf.keras.layers.Dense(units=10, activation="relu")) model.add(tf.keras.layers.Dense(10, activation="softmax")) return model @@ -56,7 +59,7 @@ def setup_save_mock_open(mock_open): return mock_file -@mock.patch('tensorflow.keras.models.load_model', side_effect=mock_tf_model) +@mock.patch("tensorflow.keras.models.load_model", side_effect=mock_tf_model) class TestCharLoadTFModel(unittest.TestCase): @classmethod def setUpClass(cls): @@ -121,7 +124,7 @@ def test_init(self, *mocks): ] self.assertDictEqual(self.label_mapping, model.label_mapping) - self.assertEqual(self.model_path, model._parameters['model_path']) + self.assertEqual(self.model_path, model._parameters["model_path"]) self.assertListEqual(expected_labels, model.labels) def test_reverse_label_mapping(self, *mocks): @@ -151,8 +154,7 @@ def test_reverse_label_mapping(self, *mocks): } self.assertDictEqual( - expected_reverse_label_mapping, - model.reverse_label_mapping + expected_reverse_label_mapping, model.reverse_label_mapping ) def test_set_label_mapping(self, *mocks): @@ -219,15 +221,12 @@ def test_predict(self, *mocks): data_gen = [np.array([[1, 3], [1, 2]])] result = model.predict(data_gen) self.assertIn("pred", result) - self.assertEqual((2, 2), np.array(result['pred']).shape) + self.assertEqual((2, 2), np.array(result["pred"]).shape) result = model.predict(data_gen, show_confidences=True) self.assertIn("pred", result) self.assertIn("conf", result) - self.assertEqual( - (2, 2, model.num_labels), - np.array(result['conf']).shape - ) + self.assertEqual((2, 2, model.num_labels), np.array(result["conf"]).shape) def test_fit_and_predict(self, *mocks): # model @@ -260,7 +259,7 @@ def test_fit_and_predict(self, *mocks): "TEST": 1, "NEW": 2, "MAPPING": 3, - model._parameters['default_label']: 4, + model._parameters["default_label"]: 4, } data_gen = [ [ @@ -304,9 +303,7 @@ def test_param_validation(self, *mocks): "fake_extra_param": "fails", } model = CharLoadTFModel( - self.model_path, - label_mapping=self.label_mapping, - parameters=parameters + self.model_path, label_mapping=self.label_mapping, parameters=parameters ) model._construct_model() self.assertDictEqual(parameters, model._parameters) @@ -314,7 +311,7 @@ def test_param_validation(self, *mocks): CharLoadTFModel( self.model_path, label_mapping=self.label_mapping, - parameters=invalid_parameters + parameters=invalid_parameters, ) @mock.patch("sys.stdout", new_callable=StringIO) From 4002125a1335bbd281af484b9d8089d8f5d29556 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 13 Jul 2022 10:14:38 -0500 Subject: [PATCH 14/14] fix: variable suggestion --- dataprofiler/labelers/char_load_tf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 01e2f6bb8..5357dfb73 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -95,7 +95,7 @@ def _validate_parameters(self, parameters): # Make sure the necessary parameters are present and valid. 
for param in parameters: - if param in ["default_label", "model_path", "pad_label"]: + if param in list_of_necessary_params: if not isinstance(parameters[param], str): error = str(param) + " must be a string." errors.append(error)
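A note on the F-beta definition quoted in the docstrings above: `result()` computes `mul_value = precision * recall` and `add_value = (tf.math.square(self.beta) * precision) + recall`, so the metric the code actually implements is

$$
F_{\beta} = (1 + \beta^2) \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}}
$$

The docstring rendering, which squares the precision in the numerator, is a typo; the numerator of the weighted harmonic mean is precision times recall, as the `result()` implementation shows.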
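For reference, a minimal usage sketch of the `CharLoadTFModel` exercised by the tests above. The label mapping mirrors `test_char_load_tf_data_labeler.py`, and the toy integer batch matches `test_predict`; the SavedModel directory is a hypothetical stand-in, and the model on disk is assumed to end in a softmax layer, which `_construct_model` re-heads with a `Dense` softmax of width `num_labels` plus an argmax output.

    import numpy as np

    from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel

    # Hypothetical path to any Keras SavedModel ending in a softmax layer.
    model_path = "path/to/saved_model"
    label_mapping = {"PAD": 0, "UNKNOWN": 1, "ADDRESS": 2, "PERSON": 3}

    model = CharLoadTFModel(model_path, label_mapping=label_mapping)

    # predict() consumes an iterable of pre-encoded integer batches and
    # returns character-level predictions, plus confidences on request.
    data_gen = [np.array([[1, 3], [1, 2]])]
    results = model.predict(data_gen, show_confidences=True)
    # results["pred"] has shape (2, 2); results["conf"] adds a
    # num_labels axis, matching the assertions in test_predict.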
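The `PAD` handling in `CharLoadTFModel.set_label_mapping` depends on the input type; a short sketch of the three branches, continuing the hypothetical `model` above:

    # A list is prepended with 'PAD', so it lands at index 0.
    model.set_label_mapping(["UNKNOWN", "ADDRESS", "PERSON"])

    # A dict lacking 'PAD' but already using index 0 gets 'PAD' at max + 1.
    model.set_label_mapping({"UNKNOWN": 0, "ADDRESS": 1, "PERSON": 2})
    # -> {'UNKNOWN': 0, 'ADDRESS': 1, 'PERSON': 2, 'PAD': 3}

    # A dict missing both 'PAD' and index 0 gets 'PAD': 0 added.
    model.set_label_mapping({"UNKNOWN": 1, "ADDRESS": 2, "PERSON": 3})
    # -> {'PAD': 0, 'UNKNOWN': 1, 'ADDRESS': 2, 'PERSON': 3}

In every case the configured `default_label` ("UNKNOWN" by default) must be present in the mapping, or a `ValueError` is raised.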