From 832c9572b01fbe2911b4aea055f0010bd0382998 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Thu, 31 Mar 2022 16:31:47 -0500 Subject: [PATCH 01/14] feat: new model and pep8 --- .../labelers/character_level_cnn_model.py | 21 +- .../labelers/pre_encoded_char_cnn_model.py | 839 ++++++++++++++++++ 2 files changed, 858 insertions(+), 2 deletions(-) create mode 100644 dataprofiler/labelers/pre_encoded_char_cnn_model.py diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 8a9f164ba..4a9e84311 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -5,6 +5,7 @@ import sys import time from collections import defaultdict +import functools import numpy as np import tensorflow as tf @@ -34,7 +35,23 @@ def filter(self, record): tf_logger.addFilter(NoV1ResourceMessageFilter()) -@tf.keras.utils.register_keras_serializable() +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. + """ + def decorator(arg): + """Protects against double registration of a keras layer.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + '>' + class_name + if tf.keras.utils.get_registered_object(registered_name) is None: + tf.keras.utils.register_keras_serializable(package, name)(arg) + return arg + return decorator + + +@protected_register_keras_serializable() class FBetaScore(tf.keras.metrics.Metric): r"""Computes F-Beta score. Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 @@ -198,7 +215,7 @@ def reset_states(self): tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) -@tf.keras.utils.register_keras_serializable() +@protected_register_keras_serializable() class F1Score(FBetaScore): r"""Computes F-1 Score. diff --git a/dataprofiler/labelers/pre_encoded_char_cnn_model.py b/dataprofiler/labelers/pre_encoded_char_cnn_model.py new file mode 100644 index 000000000..a8ecb96de --- /dev/null +++ b/dataprofiler/labelers/pre_encoded_char_cnn_model.py @@ -0,0 +1,839 @@ +import json +import copy +import os +import sys +import time +import logging +from collections import defaultdict +import functools + +import tensorflow as tf +import numpy as np +from sklearn import decomposition + +from . import labeler_utils +from .base_model import BaseModel, BaseTrainableModel +from .base_model import AutoSubRegistrationMeta +from .. import dp_logging + +_file_dir = os.path.dirname(os.path.abspath(__file__)) + +logger = dp_logging.get_child_logger(__name__) + + +class NoV1ResourceMessageFilter(logging.Filter): + """Removes TF2 warning for using TF1 model which has resources.""" + def filter(self, record): + msg = 'is a problem, consider rebuilding the SavedModel after ' + \ + 'running tf.compat.v1.enable_resource_variables()' + return msg not in record.getMessage() + + +tf_logger = logging.getLogger('tensorflow') +tf_logger.addFilter(NoV1ResourceMessageFilter()) + + +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. 
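+
+    A minimal illustrative sketch (``MyMetric`` is a hypothetical class)::
+
+        @protected_register_keras_serializable(package='Custom')
+        class MyMetric(tf.keras.metrics.Metric):
+            ...
+
+    If ``Custom>MyMetric`` is already registered, the decorator simply
+    returns the class without attempting to register it again.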
+    """
+    def decorator(arg):
+        """Protects against double registration of a keras layer."""
+        class_name = name if name is not None else arg.__name__
+        registered_name = package + '>' + class_name
+        if tf.keras.utils.get_registered_object(registered_name) is None:
+            tf.keras.utils.register_keras_serializable(package, name)(arg)
+        return arg
+    return decorator
+
+
+@protected_register_keras_serializable()
+class FBetaScore(tf.keras.metrics.Metric):
+    r"""Computes F-Beta score.
+    Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283
+
+    # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    # http://www.apache.org/licenses/LICENSE-2.0
+    # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ==============================================================================
+
+    It is the weighted harmonic mean of precision
+    and recall. Output range is `[0, 1]`. Works for
+    both multi-class and multi-label classification.
+    $$
+    F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{recall}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}}
+    $$
+    Args:
+        num_classes: Number of unique classes in the dataset.
+        average: Type of averaging to be performed on data.
+            Acceptable values are `None`, `micro`, `macro` and
+            `weighted`. Default value is None.
+        beta: Determines the relative weight given to recall versus
+            precision in the harmonic mean. Default value is 1.
+        threshold: Elements of `y_pred` greater than threshold are
+            converted to be 1, and the rest 0. If threshold is
+            None, the argmax is converted to 1, and the rest 0.
+        name: (Optional) String name of the metric instance.
+        dtype: (Optional) Data type of the metric result.
+    Returns:
+        F-Beta Score: float.
+    """
+
+    # Modification: remove the run-time type checking for functions
+    def __init__(self, num_classes, average=None, beta=1.0, threshold=None,
+                 name="fbeta_score", dtype=None, **kwargs):
+        super().__init__(name=name, dtype=dtype)
+
+        if average not in (None, "micro", "macro", "weighted"):
+            raise ValueError(
+                "Unknown average type. 
Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name): + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state(self, y_true, y_pred, sample_weight=None): + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_states(self): + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) + + +@protected_register_keras_serializable() +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    # http://www.apache.org/licenses/LICENSE-2.0
+    # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ==============================================================================
+
+    It is the harmonic mean of precision and recall.
+    Output range is `[0, 1]`. Works for both multi-class
+    and multi-label classification.
+    $$
+    F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}}
+    $$
+    Args:
+        num_classes: Number of unique classes in the dataset.
+        average: Type of averaging to be performed on data.
+            Acceptable values are `None`, `micro`, `macro`
+            and `weighted`. Default value is None.
+        threshold: Elements of `y_pred` above threshold are
+            considered to be 1, and the rest 0. If threshold is
+            None, the argmax is converted to 1, and the rest 0.
+        name: (Optional) String name of the metric instance.
+        dtype: (Optional) Data type of the metric result.
+    Returns:
+        F-1 Score: float.
+    """
+
+    # Modification: remove the run-time type checking for functions
+    def __init__(self, num_classes, average=None, threshold=None,
+                 name="f1_score", dtype=None):
+        super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype)
+
+    def get_config(self):
+        base_config = super().get_config()
+        del base_config["beta"]
+        return base_config
+
+
+class PreEncodedCharCnnModel(BaseTrainableModel,
+                             metaclass=AutoSubRegistrationMeta):
+
+    # boolean if the label mapping requires the mapping for index 0 reserved
+    requires_zero_mapping = True
+
+    def __init__(self, label_mapping=None, parameters=None):
+        """
+        Initialize the PreEncodedCharCnnModel; also sets the epoch id to zero.
+
+        :param label_mapping: maps labels to their encoded integers
+        :type label_mapping: dict
+        :param parameters: Contains all the appropriate parameters for the
+            model. Possible parameters are: max_length, alphabet_size,
+            dim_embed, conv_layers, size_fc, dropout, threshold,
+            default_label
+        :type parameters: dict
+        :return: None
+        """
+
+        # parameter initialization
+        if not parameters:
+            parameters = {}
+        parameters.setdefault('max_length', 1014)
+        parameters.setdefault('alphabet_size', 69)
+        parameters.setdefault('dim_embed', 32)
+        parameters.setdefault('conv_layers', [
+            [256, 7, 1],
+            [256, 7, 1],
+            [256, 3, -1],
+            [256, 3, -1],
+            [256, 3, -1],
+            [256, 3, 1]
+        ])
+        parameters.setdefault('size_fc', [512, 512])
+        parameters.setdefault('dropout', 0.5)
+        parameters.setdefault('threshold', 1e-6)
+        parameters.setdefault('default_label', "UNKNOWN")
+        parameters['pad_label'] = 'PAD'
+        self._epoch_id = 0
+
+        # reconstruct flags for model
+        self._model_num_labels = 0
+        self._model_default_ind = -1
+
+        BaseModel.__init__(self, label_mapping, parameters)
+
+    def __eq__(self, other):
+        """
+        Check whether two models are equal; only key variables may be
+        compared, i.e. the underlying TF model itself may not be checked.
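+
+        A hypothetical comparison sketch::
+
+            model_a == model_b  # True when parameters and label mappings match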
+
+        :param self: a model
+        :param other: a model
+        :type self: BaseModel
+        :type other: BaseModel
+        :return: Whether or not self and other are equal
+        :rtype: bool
+        """
+        if self._parameters != other._parameters \
+                or self._label_mapping != other._label_mapping:
+            return False
+        return True
+
+    def _validate_parameters(self, parameters):
+        """
+        Validate the parameters sent in. Raise error if invalid parameters are
+        present.
+
+        :param parameters: parameter dict containing the following parameters:
+            max_length: Maximum char length in a sample
+            alphabet_size: Size of the character alphabet used by the
+                pre-encoded input (the embedding has alphabet_size + 1 rows)
+            dim_embed: Number of embedded dimensions
+            conv_layers: List of [num_filters, kernel_size, pool_size]
+                entries, one per convolution layer (a pool_size of -1 skips
+                pooling for that layer)
+            size_fc: Sizes of each fully connected layer
+            dropout: Ratio of dropout in the model
+            threshold: Threshold used by the ThresholdedReLU activations
+            default_label: Key for label_mapping that is the default label
+            pad_label: Key for entities_dict that is the pad label
+        :type parameters: dict
+        :return: None
+        """
+        errors = []
+        list_of_necessary_params = ['max_length', 'alphabet_size',
+                                    'dim_embed', 'size_fc', 'dropout',
+                                    'threshold', 'conv_layers', 'default_label',
+                                    'pad_label']
+        # Make sure the necessary parameters are present and valid.
+        for param in parameters:
+            if param in ['max_length', 'alphabet_size', 'dim_embed']:
+                if not isinstance(parameters[param], (int, float)) \
+                        or parameters[param] < 0:
+                    errors.append(param + " must be a valid integer or float "
+                                          "greater than 0.")
+            elif param in ['dropout', 'threshold']:
+                if not isinstance(parameters[param], (int, float)) \
+                        or parameters[param] < 0 or parameters[param] > 1:
+                    errors.append(param + " must be a valid integer or float "
+                                          "from 0 to 1.")
+            elif param == 'size_fc':
+                if not isinstance(parameters[param], list) \
+                        or len(parameters[param]) == 0:
+                    errors.append(param + " must be a non-empty list of "
+                                          "integers.")
+                else:
+                    for item in parameters[param]:
+                        if not isinstance(item, int):
+                            errors.append(param + " must be a non-empty "
+                                          "list of integers.")
+                            break
+            elif param == 'conv_layers':
+                is_bad_conv_layers = True
+                if isinstance(parameters[param], list):
+                    is_bad_conv_layers = False
+                    for layer in parameters[param]:
+                        if (not isinstance(layer, list) or len(layer) != 3
+                                or any([not isinstance(x, int) for x in layer])):
+                            is_bad_conv_layers = True
+                if is_bad_conv_layers:
+                    errors.append(param + " must be a non-empty list of "
+                                          "lists, each containing 3 integers.")
+            elif param == 'default_label':
+                if not isinstance(parameters[param], str):
+                    error = str(param) + " must be a string."
+ errors.append(error) + + # Error if there are extra parameters thrown in + for param in parameters: + if param not in list_of_necessary_params: + errors.append(param + " is not an accepted parameter.") + if errors: + raise ValueError('\n'.join(errors)) + + def set_label_mapping(self, label_mapping): + """ + Sets the labels for the model + + :param label_mapping: label mapping of the model + :type label_mapping: dict + :return: None + """ + if not isinstance(label_mapping, (list, dict)): + raise TypeError("Labels must either be a non-empty encoding dict " + "which maps labels to index encodings or a list.") + + label_mapping = copy.deepcopy(label_mapping) + if 'PAD' not in label_mapping: + if isinstance(label_mapping, list): # if list missing PAD + label_mapping = ['PAD'] + label_mapping + elif 0 not in label_mapping.values(): # if dict missing PAD and 0 + label_mapping.update({'PAD': 0}) + if (isinstance(label_mapping, dict) + and label_mapping.get('PAD', None) != 0): # dict with bad PAD + raise ValueError("`PAD` must map to index zero.") + if self._parameters['default_label'] not in label_mapping: + raise ValueError("The `default_label` of {} must exist in the " + "label mapping.".format( + self._parameters['default_label'])) + super().set_label_mapping(label_mapping) + + def _need_to_reconstruct_model(self): + """ + Determines whether or not the model needs to be reconstructed. + + :return: bool of whether or not the model needs to reconstruct. + """ + if not self._model: + return False + default_ind = self.label_mapping[self._parameters['default_label']] + return self.num_labels != self._model_num_labels or \ + default_ind != self._model_default_ind + + def save_to_disk(self, dirpath): + """ + Saves whole model to disk with weights + + :param dirpath: directory path where you want to save the model to + :type dirpath: str + :return: None + """ + if not self._model: + self._construct_model() + elif self._need_to_reconstruct_model(): + self._reconstruct_model() + + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'w') as fp: + json.dump(self._parameters, fp) + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'w') as fp: + json.dump(self.label_mapping, fp) + self._model.save(os.path.join(dirpath)) + + @classmethod + def load_from_disk(cls, dirpath): + """ + Loads whole model from disk with weights + + :param dirpath: directory path where you want to load the model from + :type dirpath: str + :return: None + """ + + # load parameters + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'r') as fp: + parameters = json.load(fp) + + # load label_mapping + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'r') as fp: + label_mapping = json.load(fp) + + # use f1 score metric + custom_objects = { + "F1Score": F1Score( + num_classes=max(label_mapping.values()) + 1, + average='micro'), + "CharacterLevelCnnModel": cls, + } + with tf.keras.utils.custom_object_scope(custom_objects): + tf_model = tf.keras.models.load_model(dirpath) + + loaded_model = cls(label_mapping, parameters) + loaded_model._model = tf_model + # + # # Tensorflow v1 Model weights need to be transferred. 
+ # if not callable(tf_model): + # loaded_model._construct_model() + # tf1_weights = [] + # for var in tf_model.variables: + # if 'training' not in var.name: + # tf1_weights.append(var.value()) + # + # loaded_model._construct_model() + # tf1_weights.append(loaded_model._model.weights[-1].value()) + # loaded_model._model.set_weights(tf1_weights) + + # load self + loaded_model._model_num_labels = loaded_model.num_labels + loaded_model._model_default_ind = loaded_model.label_mapping[ + loaded_model._parameters['default_label'] + ] + return loaded_model + + def _construct_model(self): + """ + Model constructor for the data labeler. This also serves as a weight + reset. + + :return: None + """ + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + + # default parameters + max_length = self._parameters['max_length'] + alphabet_size = self._parameters['alphabet_size'] + dim_embed = self._parameters['dim_embed'] + conv_layers = self._parameters['conv_layers'] + size_fc = self._parameters['size_fc'] + threshold = self._parameters['threshold'] + dropout = self._parameters['dropout'] + + # Reset model + tf.keras.backend.clear_session() + + # Input layer + inputs = tf.keras.layers.Input( + shape=(None,), name='sent_input', dtype='int64') + # Embedding layers + x_embedding = tf.keras.layers.Embedding( + alphabet_size + 1, dim_embed, input_length=max_length)(inputs) + + # Convolution layers + x = x_embedding + for cl in conv_layers: + x = tf.keras.layers.Convolution1D(cl[0], cl[1], padding='same')(x) + x = tf.keras.layers.ThresholdedReLU(threshold)(x) + if cl[2] != -1: + x = tf.keras.layers.MaxPooling1D(cl[2])(x) + # x = tf.keras.layers.Flatten()(x) + + # Fully connected layers + for fl in size_fc: + x_dense = tf.keras.layers.Dense(fl)(x) + x = tf.keras.layers.ThresholdedReLU(threshold)(x_dense) + x = tf.keras.layers.Dropout(dropout)(x) + + # Output layer + predictions = tf.keras.layers.Dense( + num_labels, activation='softmax', name='softmax_output')(x) + # argmax layer + argmax_layer = tf.keras.backend.argmax(predictions) + + # Build and compile model + self._model = tf.keras.models.Model( + inputs=inputs, outputs=[predictions, argmax_layer]) + + # Compile the model w/ metrics + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def reset_weights(self): + """ + Reset the weights of the model. 
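+        Since this simply calls ``_construct_model``, the network is rebuilt
+        and recompiled from scratch.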
+ + :return: None + """ + self._construct_model() + + def _reconstruct_model(self): + """ + Reconstruct the appropriate layers if the number of number of labels is + altered + + :return: None + """ + + # Reset model + tf.keras.backend.clear_session() + + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + + # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') + for _ in range(2): + self._model.layers.pop() + + # Add the final Softmax layer to the previous spot + final_softmax_layer = tf.keras.layers.Dense( + num_labels, activation='softmax', name="softmax_output")( + self._model.layers[-4].output) + + # Output the model into a .pb file for TensorFlow + argmax_layer = tf.keras.backend.argmax(final_softmax_layer) + + + argmax_outputs = [final_softmax_layer, argmax_layer] + self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + + # Compile the model + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, + reset_weights=False, verbose=True): + """ + Train the current model with the training data and validation data + + :param train_data: Training data used to train model + :type train_data: Union[list, np.ndarray] + :param val_data: Validation data used to validate the training + :type val_data: Union[list, np.ndarray] + :param batch_size: Used to determine number of samples in each batch + :type batch_size: int + :param label_mapping: maps labels to their encoded integers + :type label_mapping: Union[dict, None] + :param reset_weights: Flag to determine whether to reset the weights or + not + :type reset_weights: bool + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: None + """ + + if label_mapping is not None: + self.set_label_mapping(label_mapping) + + if not self._model: + self._construct_model() + else: + if self._need_to_reconstruct_model(): + self._reconstruct_model() + if reset_weights: + self.reset_weights() + + history = defaultdict() + f1 = None + f1_report = [] + + self._model.reset_metrics() + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + + start_time = time.time() + batch_id = 0 + for x_train, y_train in train_data: + model_results = self._model.train_on_batch( + x_train, {softmax_output_layer_name: y_train}) + sys.stdout.flush() + if verbose: + sys.stdout.write( + "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " + "f1_score %f" % + (self._epoch_id, batch_id, *model_results[1:])) + batch_id += 1 + + for i, metric_label in enumerate(self._model.metrics_names): + history[metric_label] = model_results[i] + + if val_data: + f1, f1_report = self._validate_training(val_data) + history['f1_report'] = f1_report + + val_f1 = f1_report['weighted avg']['f1-score'] \ + if f1_report else np.NAN + val_precision = f1_report['weighted avg']['precision'] \ + if f1_report else np.NAN + val_recall = f1_report['weighted avg']['recall'] \ + if f1_report else np.NAN + epoch_time = time.time() - start_time + logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - 
f1_score %f -- " + "val_f1: %f - val_precision: %f - val_recall %f" % + (self._epoch_id, epoch_time, *model_results[1:], + val_f1, val_precision, val_recall)) + + self._epoch_id += 1 + + return history, f1, f1_report + + def _validate_training(self, val_data, batch_size_test=32, + verbose_log=True, verbose_keras=False): + """ + Validate the model on the test set and return the evaluation metrics. + + :param val_data: data generator for the validation + :type val_data: iterator + :param batch_size_test: Number of samples to process in testing + :type batch_size_test: int + :param verbose_log: whether or not to print out scores for training, + etc. + :type verbose_log: bool + :param verbose_keras: whether or not to print out scores for training, + from keras. + :type verbose_keras: bool + return (f1-score, f1 report). + """ + f1 = None + f1_report = None + + if val_data is None: + return f1, f1_report + + # Predict on the test set + batch_id = 0 + y_val_pred = [] + y_val_test = [] + for x_val, y_val in val_data: + y_val_pred.append(self._model.predict( + x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) + y_val_test.append(np.argmax(y_val, axis=-1)) + batch_id += 1 + sys.stdout.flush() + if verbose_log: + sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % + (self._epoch_id, batch_id)) + + tf.keras.backend.set_floatx('float32') + # Clean the predicted entities and the actual entities + f1, f1_report = labeler_utils.evaluate_accuracy( + np.concatenate(y_val_pred, axis=0), + np.concatenate(y_val_test, axis=0), + self.num_labels, + self.reverse_label_mapping, + verbose=verbose_keras) + + return f1, f1_report + + def predict(self, data, batch_size=32, show_confidences=False, + verbose=True): + """ + Run model and get predictions + + :param data: text input + :type data: Union[list, numpy.ndarray] + :param batch_size: number of samples in the batch of data + :type batch_size: int + :param show_confidences: whether user wants prediction confidences + :type show_confidences: + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: char level predictions and confidences + :rtype: dict + """ + if not self._model: + raise ValueError("You are trying to predict without a model. " + "Construct/Load a model before predicting.") + elif self._need_to_reconstruct_model(): + raise RuntimeError("The model label mapping definitions have been " + "altered without additional training. 
Please " + "train the model or reset the label mapping to " + "predict.") + # Pre-allocate space for predictions + confidences = [] + sentence_lengths = np.zeros((batch_size,), dtype=int) + predictions = np.zeros((batch_size, self._parameters['max_length'])) + if show_confidences: + confidences = np.zeros((batch_size, + self._parameters['max_length'], + self.num_labels)) + + # Run model with batching + allocation_index = 0 + for batch_id, batch_data in enumerate(data): + model_output = self._model( + tf.convert_to_tensor(batch_data) + ) + + # Count number of samples in batch to prevent array mismatch + num_samples_in_batch = len(batch_data) + allocation_index = batch_id * batch_size + + # Double array size + if len(predictions) <= allocation_index: + predictions = np.pad(predictions, ((0, len(predictions)), + (0, 0)), mode='constant') + sentence_lengths = np.pad( + sentence_lengths, pad_width=((0, len(sentence_lengths)),), + mode='constant') + if show_confidences: + confidences = np.pad(confidences, + ((0, len(predictions)), + (0, 0), (0, 0)), mode='constant') + + if show_confidences: + confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() + predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() + sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) + + allocation_index += num_samples_in_batch + + # Convert predictions, confidences to lists from numpy + predictions_list = [i for i in range(0, allocation_index)] + confidences_list = None + if show_confidences: + confidences_list = [i for i in range(0, allocation_index)] + + # Append slices of predictions to return prediction & confidence matrices + for index, sentence_length \ + in enumerate(sentence_lengths[:allocation_index]): + predictions_list[index] = list(predictions[index][:sentence_length]) + if show_confidences: + confidences_list[index] = list(confidences[index][:sentence_length]) + + if show_confidences: + return {'pred': predictions_list, 'conf': confidences_list} + return {'pred': predictions_list} + + def details(self): + """ + Prints the relevant details of the model (summary, parameters, label + mapping) + """ + print("\n###### Model Details ######\n") + self._model.summary() + print("\nModel Parameters:") + for key, value in self._parameters.items(): + print("{}: {}".format(key, value)) + print("\nModel Label Mapping:") + for key, value in self.label_mapping.items(): + print("{}: {}".format(key, value)) From 391797f966e5e31bbb98dcb6fc7d9bacfd988bf0 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 5 Apr 2022 16:48:02 -0500 Subject: [PATCH 02/14] feat: load any tf model --- .../labelers/char_load_tf_trainable.py | 762 ++++++++++++++++++ 1 file changed, 762 insertions(+) create mode 100644 dataprofiler/labelers/char_load_tf_trainable.py diff --git a/dataprofiler/labelers/char_load_tf_trainable.py b/dataprofiler/labelers/char_load_tf_trainable.py new file mode 100644 index 000000000..492b4324f --- /dev/null +++ b/dataprofiler/labelers/char_load_tf_trainable.py @@ -0,0 +1,762 @@ +import json +import copy +import os +import sys +import time +import logging +from collections import defaultdict +import functools + +import tensorflow as tf +import numpy as np +from sklearn import decomposition + +from . import labeler_utils +from .base_model import BaseModel, BaseTrainableModel +from .base_model import AutoSubRegistrationMeta +from .. 
import dp_logging + +_file_dir = os.path.dirname(os.path.abspath(__file__)) + +logger = dp_logging.get_child_logger(__name__) + + +class NoV1ResourceMessageFilter(logging.Filter): + """Removes TF2 warning for using TF1 model which has resources.""" + def filter(self, record): + msg = 'is a problem, consider rebuilding the SavedModel after ' + \ + 'running tf.compat.v1.enable_resource_variables()' + return msg not in record.getMessage() + + +tf_logger = logging.getLogger('tensorflow') +tf_logger.addFilter(NoV1ResourceMessageFilter()) + + +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. + """ + def decorator(arg): + """Protects against double registration of a keras layer.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + '>' + class_name + if tf.keras.utils.get_registered_object(registered_name) is None: + tf.keras.utils.register_keras_serializable(package, name)(arg) + return arg + return decorator + + +@protected_register_keras_serializable() +class FBetaScore(tf.keras.metrics.Metric): + r"""Computes F-Beta score. + Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================== + + It is the weighted harmonic mean of precision + and recall. Output range is `[0, 1]`. Works for + both multi-class and multi-label classification. + $$ + F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} + $$ + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` and + `weighted`. Default value is None. + beta: Determines the weight of precision and recall + in harmonic mean. Determines the weight given to the + precision and recall. Default value is 1. + threshold: Elements of `y_pred` greater than threshold are + converted to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + Returns: + F-Beta Score: float. + """ + + # Modification: remove the run-time type checking for functions + def __init__(self, num_classes, average=None, beta=1.0, threshold=None, + name="fbeta_score", dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype) + + if average not in (None, "micro", "macro", "weighted"): + raise ValueError( + "Unknown average type. 
Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name): + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state(self, y_true, y_pred, sample_weight=None): + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_states(self): + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) + + +@protected_register_keras_serializable() +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    # http://www.apache.org/licenses/LICENSE-2.0
+    # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ==============================================================================
+
+    It is the harmonic mean of precision and recall.
+    Output range is `[0, 1]`. Works for both multi-class
+    and multi-label classification.
+    $$
+    F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}}
+    $$
+    Args:
+        num_classes: Number of unique classes in the dataset.
+        average: Type of averaging to be performed on data.
+            Acceptable values are `None`, `micro`, `macro`
+            and `weighted`. Default value is None.
+        threshold: Elements of `y_pred` above threshold are
+            considered to be 1, and the rest 0. If threshold is
+            None, the argmax is converted to 1, and the rest 0.
+        name: (Optional) String name of the metric instance.
+        dtype: (Optional) Data type of the metric result.
+    Returns:
+        F-1 Score: float.
+    """
+
+    # Modification: remove the run-time type checking for functions
+    def __init__(self, num_classes, average=None, threshold=None,
+                 name="f1_score", dtype=None):
+        super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype)
+
+    def get_config(self):
+        base_config = super().get_config()
+        del base_config["beta"]
+        return base_config
+
+
+class CharLoadTFCnnModel(BaseTrainableModel,
+                         metaclass=AutoSubRegistrationMeta):
+
+    # boolean if the label mapping requires the mapping for index 0 reserved
+    requires_zero_mapping = False
+
+    def __init__(self, original_model_path, label_mapping=None,
+                 parameters=None):
+        """
+        Initialize the CharLoadTFCnnModel; also sets the epoch id to zero.
+
+        :param original_model_path: path to the saved TF model to load
+        :type original_model_path: str
+        :param label_mapping: maps labels to their encoded integers
+        :type label_mapping: dict
+        :param parameters: Contains all the appropriate parameters for the
+            model. The only optional parameter is default_label;
+            original_model_path and pad_label are set automatically.
+        :type parameters: dict
+        :return: None
+        """
+
+        # parameter initialization
+        if not parameters:
+            parameters = {}
+        parameters.setdefault('default_label', "UNKNOWN")
+        parameters['original_model_path'] = original_model_path
+        parameters['pad_label'] = 'PAD'
+        self._epoch_id = 0
+
+        # reconstruct flags for model
+        self._model_num_labels = 0
+        self._model_default_ind = -1
+
+        BaseModel.__init__(self, label_mapping, parameters)
+
+    def __eq__(self, other):
+        """
+        Check whether two models are equal; only key variables may be
+        compared, i.e. the underlying TF model itself may not be checked.
+
+        :param self: a model
+        :param other: a model
+        :type self: BaseModel
+        :type other: BaseModel
+        :return: Whether or not self and other are equal
+        :rtype: bool
+        """
+        if self._parameters != other._parameters \
+                or self._label_mapping != other._label_mapping:
+            return False
+        return True
+
+    def _validate_parameters(self, parameters):
+        """
+        Validate the parameters sent in. Raise error if invalid parameters are
+        present.
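+
+        An example dict that passes validation (the path is hypothetical)::
+
+            {
+                'original_model_path': '/path/to/saved_model',
+                'default_label': 'UNKNOWN',
+                'pad_label': 'PAD',
+            }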
+
+        :param parameters: parameter dict containing the following parameters:
+            original_model_path: Path to the saved TF model that is loaded
+            default_label: Key for label_mapping that is the default label
+            pad_label: Key for entities_dict that is the pad label
+        :type parameters: dict
+        :return: None
+        """
+        errors = []
+        list_of_necessary_params = ['original_model_path', 'default_label',
+                                    'pad_label']
+
+        # Make sure the necessary parameters are present and valid.
+        for param in parameters:
+            if param in ['default_label', 'original_model_path', 'pad_label']:
+                if not isinstance(parameters[param], str):
+                    error = str(param) + " must be a string."
+                    errors.append(error)
+
+        # Error if there are extra parameters thrown in
+        for param in parameters:
+            if param not in list_of_necessary_params:
+                errors.append(param + " is not an accepted parameter.")
+        if errors:
+            raise ValueError('\n'.join(errors))
+
+    def set_label_mapping(self, label_mapping):
+        """
+        Sets the labels for the model.
+
+        :param label_mapping: label mapping of the model
+        :type label_mapping: dict
+        :return: None
+        """
+        if not isinstance(label_mapping, (list, dict)):
+            raise TypeError("Labels must either be a non-empty encoding dict "
+                            "which maps labels to index encodings or a list.")
+
+        label_mapping = copy.deepcopy(label_mapping)
+        if 'PAD' not in label_mapping:
+            if isinstance(label_mapping, list):  # if list missing PAD
+                label_mapping = ['PAD'] + label_mapping
+            elif 0 not in label_mapping.values():  # if dict missing PAD and 0
+                label_mapping.update({'PAD': 0})
+            else:
+                label_mapping.update(
+                    {'PAD': max(list(label_mapping.values())) + 1})
+        if self._parameters['default_label'] not in label_mapping:
+            raise ValueError("The `default_label` of {} must exist in the "
+                             "label mapping.".format(
+                                 self._parameters['default_label']))
+        super().set_label_mapping(label_mapping)
+
+    def _need_to_reconstruct_model(self):
+        """
+        Determines whether or not the model needs to be reconstructed, i.e.
+        whether the number of labels or the default label index has changed
+        since the model was built.
+
+        :return: bool of whether or not the model needs to reconstruct.
+ """ + if not self._model: + return False + default_ind = self.label_mapping[self._parameters['default_label']] + return self.num_labels != self._model_num_labels or \ + default_ind != self._model_default_ind + + def save_to_disk(self, dirpath): + """ + Saves whole model to disk with weights + + :param dirpath: directory path where you want to save the model to + :type dirpath: str + :return: None + """ + if not self._model: + self._construct_model() + elif self._need_to_reconstruct_model(): + self._reconstruct_model() + + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'w') as fp: + json.dump(self._parameters, fp) + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'w') as fp: + json.dump(self.label_mapping, fp) + self._model.save(os.path.join(dirpath)) + + @classmethod + def load_from_disk(cls, dirpath): + """ + Loads whole model from disk with weights + + :param dirpath: directory path where you want to load the model from + :type dirpath: str + :return: None + """ + + # load parameters + model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + with open(model_param_dirpath, 'r') as fp: + parameters = json.load(fp) + + # load label_mapping + labels_dirpath = os.path.join(dirpath, "label_mapping.json") + with open(labels_dirpath, 'r') as fp: + label_mapping = json.load(fp) + + # use f1 score metric + custom_objects = { + "F1Score": F1Score( + num_classes=max(label_mapping.values()) + 1, + average='micro'), + "CharacterLevelCnnModel": cls, + } + with tf.keras.utils.custom_object_scope(custom_objects): + tf_model = tf.keras.models.load_model(dirpath) + + loaded_model = cls(label_mapping, parameters) + loaded_model._model = tf_model + + # load self + loaded_model._model_num_labels = loaded_model.num_labels + loaded_model._model_default_ind = loaded_model.label_mapping[ + loaded_model._parameters['default_label'] + ] + return loaded_model + + def _get_layer_index(self, layer_name): + for idx, layer in enumerate(self._model.layers): + if layer.name == layer_name: + return idx + + def _construct_model(self): + """ + Model constructor for the data labeler. This also serves as a weight + reset. 
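+        The model is reloaded from ``original_model_path``; if the loaded
+        softmax head does not match the current number of labels, a new
+        softmax output layer is attached in its place.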
+ + :return: None + """ + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + model_loc = self._parameters['original_model_path'] + + self._model = tf.keras.models.load_model(model_loc) + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_layer_ind = self._get_layer_index(softmax_output_layer_name) + softmax_layer = self._model.get_layer(softmax_output_layer_name) + prev_softmax_layer = softmax_layer.input + + new_softmax_layer = softmax_layer.output + if softmax_layer.weights[0].shape[-1] != num_labels: + new_softmax_layer = tf.keras.layers.Dense( + num_labels, activation='softmax', name="softmax_output")( + self._model.layers[softmax_layer_ind - 1].output) + + # Output the model into a .pb file for TensorFlow + argmax_layer = tf.keras.backend.argmax(new_softmax_layer) + + + argmax_outputs = [new_softmax_layer, argmax_layer] + self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + + # Compile the model w/ metrics + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def reset_weights(self): + """ + Reset the weights of the model. + + :return: None + """ + self._construct_model() + + def _reconstruct_model(self): + """ + Reconstruct the appropriate layers if the number of number of labels is + altered + + :return: None + """ + + # Reset model + tf.keras.backend.clear_session() + + num_labels = self.num_labels + default_ind = self.label_mapping[self._parameters['default_label']] + + # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') + for _ in range(2): + self._model.layers.pop() + + # Add the final Softmax layer to the previous spot + final_softmax_layer = tf.keras.layers.Dense( + num_labels, activation='softmax', name="softmax_output")( + self._model.layers[-4].output) + + # Output the model into a .pb file for TensorFlow + argmax_layer = tf.keras.backend.argmax(final_softmax_layer) + + + argmax_outputs = [final_softmax_layer, argmax_layer] + self._model = tf.keras.Model(self._model.inputs, argmax_outputs) + + # Compile the model + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + losses = {softmax_output_layer_name: "categorical_crossentropy"} + + # use f1 score metric + f1_score_training = F1Score(num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + + self._model.compile(loss=losses, optimizer="adam", metrics=metrics) + + self._epoch_id = 0 + self._model_num_labels = num_labels + self._model_default_ind = default_ind + + def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, + reset_weights=False, verbose=True): + """ + Train the current model with the training data and validation data + + :param train_data: Training data used to train model + :type train_data: Union[list, np.ndarray] + :param val_data: Validation data used to validate the training + :type val_data: Union[list, np.ndarray] + :param batch_size: Used to determine number of samples in each batch + :type batch_size: int + :param label_mapping: maps labels to their encoded integers + :type label_mapping: 
Union[dict, None] + :param reset_weights: Flag to determine whether to reset the weights or + not + :type reset_weights: bool + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: None + """ + + if label_mapping is not None: + self.set_label_mapping(label_mapping) + + if not self._model: + self._construct_model() + else: + if self._need_to_reconstruct_model(): + self._reconstruct_model() + if reset_weights: + self.reset_weights() + + history = defaultdict() + f1 = None + f1_report = [] + + self._model.reset_metrics() + softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + + start_time = time.time() + batch_id = 0 + for x_train, y_train in train_data: + model_results = self._model.train_on_batch( + x_train, {softmax_output_layer_name: y_train}) + sys.stdout.flush() + if verbose: + sys.stdout.write( + "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " + "f1_score %f" % + (self._epoch_id, batch_id, *model_results[1:])) + batch_id += 1 + + for i, metric_label in enumerate(self._model.metrics_names): + history[metric_label] = model_results[i] + + if val_data: + f1, f1_report = self._validate_training(val_data) + history['f1_report'] = f1_report + + val_f1 = f1_report['weighted avg']['f1-score'] \ + if f1_report else np.NAN + val_precision = f1_report['weighted avg']['precision'] \ + if f1_report else np.NAN + val_recall = f1_report['weighted avg']['recall'] \ + if f1_report else np.NAN + epoch_time = time.time() - start_time + logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " + "val_f1: %f - val_precision: %f - val_recall %f" % + (self._epoch_id, epoch_time, *model_results[1:], + val_f1, val_precision, val_recall)) + + self._epoch_id += 1 + + return history, f1, f1_report + + def _validate_training(self, val_data, batch_size_test=32, + verbose_log=True, verbose_keras=False): + """ + Validate the model on the test set and return the evaluation metrics. + + :param val_data: data generator for the validation + :type val_data: iterator + :param batch_size_test: Number of samples to process in testing + :type batch_size_test: int + :param verbose_log: whether or not to print out scores for training, + etc. + :type verbose_log: bool + :param verbose_keras: whether or not to print out scores for training, + from keras. + :type verbose_keras: bool + return (f1-score, f1 report). 
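+        Predictions are taken from the model's argmax output, i.e. the second
+        element returned by ``self._model.predict``.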
+ """ + f1 = None + f1_report = None + + if val_data is None: + return f1, f1_report + + # Predict on the test set + batch_id = 0 + y_val_pred = [] + y_val_test = [] + for x_val, y_val in val_data: + y_val_pred.append(self._model.predict( + x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) + y_val_test.append(np.argmax(y_val, axis=-1)) + batch_id += 1 + sys.stdout.flush() + if verbose_log: + sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % + (self._epoch_id, batch_id)) + + tf.keras.backend.set_floatx('float32') + # Clean the predicted entities and the actual entities + f1, f1_report = labeler_utils.evaluate_accuracy( + np.concatenate(y_val_pred, axis=0), + np.concatenate(y_val_test, axis=0), + self.num_labels, + self.reverse_label_mapping, + verbose=verbose_keras) + + return f1, f1_report + + def predict(self, data, batch_size=32, show_confidences=False, + verbose=True): + """ + Run model and get predictions + + :param data: text input + :type data: Union[list, numpy.ndarray] + :param batch_size: number of samples in the batch of data + :type batch_size: int + :param show_confidences: whether user wants prediction confidences + :type show_confidences: + :param verbose: Flag to determine whether to print status or not + :type verbose: bool + :return: char level predictions and confidences + :rtype: dict + """ + if not self._model: + raise ValueError("You are trying to predict without a model. " + "Construct/Load a model before predicting.") + elif self._need_to_reconstruct_model(): + raise RuntimeError("The model label mapping definitions have been " + "altered without additional training. Please " + "train the model or reset the label mapping to " + "predict.") + # Pre-allocate space for predictions + confidences = [] + # sentence_lengths = np.zeros((batch_size,), dtype=int) + # predictions = np.zeros((batch_size, self._parameters['max_length'])) + predictions = [] + # if show_confidences: + # confidences = np.zeros((batch_size, + # self._parameters['max_length'], + # self.num_labels)) + + # Run model with batching + allocation_index = 0 + for batch_id, batch_data in enumerate(data): + model_output = self._model( + tf.convert_to_tensor(batch_data) + ) + + # Count number of samples in batch to prevent array mismatch + num_samples_in_batch = len(batch_data) + allocation_index = batch_id * batch_size + + # Double array size + if len(predictions) <= allocation_index: + predictions += predictions + # sentence_lengths = np.pad( + # sentence_lengths, pad_width=((0, len(sentence_lengths)),), + # mode='constant') + if show_confidences: + confidences += confidences + + if show_confidences: + confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() + predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() + # sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) + + allocation_index += num_samples_in_batch + + # Convert predictions, confidences to lists from numpy + predictions = [predictions[i].tolist() for i in range(0, allocation_index)] + confidences_list = None + if show_confidences: + confidences = [confidences[i].tolist() + for i in range(0, allocation_index)] + + # # Append slices of predictions to return prediction & confidence matrices + # for index, sentence_length \ + # in enumerate(sentence_lengths[:allocation_index]): + # predictions_list[index] = list(predictions[index][:sentence_length]) + # if show_confidences: + # 
confidences_list[index] = list(confidences[index][:sentence_length]) + + if show_confidences: + return {'pred': predictions, 'conf': confidences} + return {'pred': predictions} + + def details(self): + """ + Prints the relevant details of the model (summary, parameters, label + mapping) + """ + print("\n###### Model Details ######\n") + self._model.summary() + print("\nModel Parameters:") + for key, value in self._parameters.items(): + print("{}: {}".format(key, value)) + print("\nModel Label Mapping:") + for key, value in self.label_mapping.items(): + print("{}: {}".format(key, value)) From e66e0b48cb31a306c554a8820651ae5dd967602d Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Mon, 4 Jul 2022 11:21:12 -0500 Subject: [PATCH 03/14] refactor: for new model --- ..._tf_trainable.py => char_load_tf_model.py} | 270 ++---------------- .../labelers/character_level_cnn_model.py | 264 +---------------- dataprofiler/labelers/labeler_utils.py | 254 ++++++++++++++++ 3 files changed, 282 insertions(+), 506 deletions(-) rename dataprofiler/labelers/{char_load_tf_trainable.py => char_load_tf_model.py} (64%) diff --git a/dataprofiler/labelers/char_load_tf_trainable.py b/dataprofiler/labelers/char_load_tf_model.py similarity index 64% rename from dataprofiler/labelers/char_load_tf_trainable.py rename to dataprofiler/labelers/char_load_tf_model.py index 492b4324f..568b335f7 100644 --- a/dataprofiler/labelers/char_load_tf_trainable.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -19,254 +19,22 @@ _file_dir = os.path.dirname(os.path.abspath(__file__)) logger = dp_logging.get_child_logger(__name__) +labeler_utils.hide_tf_logger_warnings() -class NoV1ResourceMessageFilter(logging.Filter): - """Removes TF2 warning for using TF1 model which has resources.""" - def filter(self, record): - msg = 'is a problem, consider rebuilding the SavedModel after ' + \ - 'running tf.compat.v1.enable_resource_variables()' - return msg not in record.getMessage() - - -tf_logger = logging.getLogger('tensorflow') -tf_logger.addFilter(NoV1ResourceMessageFilter()) - - -def protected_register_keras_serializable(package='Custom', name=None): - """ - Protects against already registered keras serializable layers. This - ensures that if it was already registered, it will not try to register it - again. - """ - def decorator(arg): - """Protects against double registration of a keras layer.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - if tf.keras.utils.get_registered_object(registered_name) is None: - tf.keras.utils.register_keras_serializable(package, name)(arg) - return arg - return decorator - - -@protected_register_keras_serializable() -class FBetaScore(tf.keras.metrics.Metric): - r"""Computes F-Beta score. - Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the weighted harmonic mean of precision - and recall. Output range is `[0, 1]`. Works for - both multi-class and multi-label classification. - $$ - F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` and - `weighted`. Default value is None. - beta: Determines the weight of precision and recall - in harmonic mean. Determines the weight given to the - precision and recall. Default value is 1. - threshold: Elements of `y_pred` greater than threshold are - converted to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-Beta Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, beta=1.0, threshold=None, - name="fbeta_score", dtype=None, **kwargs): - super().__init__(name=name, dtype=dtype) - - if average not in (None, "micro", "macro", "weighted"): - raise ValueError( - "Unknown average type. Acceptable values " - "are: [None, 'micro', 'macro', 'weighted']" - ) - - if not isinstance(beta, float): - raise TypeError("The value of beta should be a python float") - - if beta <= 0.0: - raise ValueError("beta value should be greater than zero") - - if threshold is not None: - if not isinstance(threshold, float): - raise TypeError("The value of threshold should be a python float") - if threshold > 1.0 or threshold <= 0.0: - raise ValueError("threshold should be between 0 and 1") - - self.num_classes = num_classes - self.average = average - self.beta = beta - self.threshold = threshold - self.axis = None - self.init_shape = [] - - if self.average != "micro": - self.axis = 0 - self.init_shape = [self.num_classes] - - def _zero_wt_init(name): - return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype - ) - - self.true_positives = _zero_wt_init("true_positives") - self.false_positives = _zero_wt_init("false_positives") - self.false_negatives = _zero_wt_init("false_negatives") - self.weights_intermediate = _zero_wt_init("weights_intermediate") - - def update_state(self, y_true, y_pred, sample_weight=None): - if self.threshold is None: - threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) - # make sure [0, 0, 0] doesn't become [1, 1, 1] - # Use abs(x) > eps, instead of x != 0 to check for zero - y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) - else: - y_pred = y_pred > self.threshold - - y_true = tf.cast(y_true, self.dtype) - y_pred = tf.cast(y_pred, self.dtype) - - def _weighted_sum(val, sample_weight): - if sample_weight is not None: - val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) - return tf.reduce_sum(val, axis=self.axis) - - self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) - self.false_positives.assign_add( - _weighted_sum(y_pred * (1 - y_true), sample_weight) - ) - self.false_negatives.assign_add( - _weighted_sum((1 - y_pred) * y_true, sample_weight) - ) - self.weights_intermediate.assign_add(_weighted_sum(y_true, 
sample_weight)) - - def result(self): - precision = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_positives - ) - recall = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_negatives - ) - - mul_value = precision * recall - add_value = (tf.math.square(self.beta) * precision) + recall - mean = tf.math.divide_no_nan(mul_value, add_value) - f1_score = mean * (1 + tf.math.square(self.beta)) - - if self.average == "weighted": - weights = tf.math.divide_no_nan( - self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) - ) - f1_score = tf.reduce_sum(f1_score * weights) - - elif self.average is not None: # [micro, macro] - f1_score = tf.reduce_mean(f1_score) - - return f1_score - - def get_config(self): - """Returns the serializable config of the metric.""" - - config = { - "num_classes": self.num_classes, - "average": self.average, - "beta": self.beta, - "threshold": self.threshold, - } - - base_config = super().get_config() - return {**base_config, **config} - - def reset_states(self): - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - - -@protected_register_keras_serializable() -class F1Score(FBetaScore): - r"""Computes F-1 Score. - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the harmonic mean of precision and recall. - Output range is `[0, 1]`. Works for both multi-class - and multi-label classification. - $$ - F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` - and `weighted`. Default value is None. - threshold: Elements of `y_pred` above threshold are - considered to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-1 Score: float. 
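# Throughout this patch series the F-beta LaTeX in the docstrings was garbled
# in transit: the numerator should read precision * recall, not
# precision * precision, i.e.
#   F_beta = (1 + beta^2) * (precision * recall) / (beta^2 * precision + recall)
# which matches the result() implementation. A small standalone check of that
# formula from raw counts (illustrative only, not part of the patches):

def fbeta_from_counts(tp, fp, fn, beta=1.0):
    """Compute F-beta from true positive, false positive, false negative counts."""
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    denominator = beta ** 2 * precision + recall
    return (1 + beta ** 2) * precision * recall / denominator if denominator else 0.0

# precision = recall = 0.8, so F1 collapses to 0.8 as well
assert abs(fbeta_from_counts(tp=8, fp=2, fn=2) - 0.8) < 1e-12
# beta > 1 weighs recall more heavily; with precision == recall it is unchanged
assert abs(fbeta_from_counts(tp=8, fp=2, fn=2, beta=2.0) - 0.8) < 1e-12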
- """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, threshold=None, - name="f1_score", dtype=None): - super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) - - def get_config(self): - base_config = super().get_config() - del base_config["beta"] - return base_config - - -class CharLoadTFCnnModel(BaseTrainableModel, - metaclass=AutoSubRegistrationMeta): +class CharLoadTFModel(BaseTrainableModel, + metaclass=AutoSubRegistrationMeta): # boolean if the label mapping requires the mapping for index 0 reserved requires_zero_mapping = False - def __init__(self, original_model_path, label_mapping=None, + def __init__(self, model_path, label_mapping=None, parameters=None): """ - CNN Model Initializer. initialize epoch_id + Loadable TF Model Initializer. + :param model_path: path to model to load + :type model_path: str :param label_mapping: maps labels to their encoded integers :type label_mapping: dict :param parameters: Contains all the appropriate parameters for the @@ -281,7 +49,7 @@ def __init__(self, original_model_path, label_mapping=None, if not parameters: parameters = {} parameters.setdefault('default_label', "UNKNOWN") - parameters['original_model_path'] = original_model_path + parameters['model_path'] = model_path parameters['pad_label'] = 'PAD' self._epoch_id = 0 @@ -327,12 +95,12 @@ def _validate_parameters(self, parameters): :return: None """ errors = [] - list_of_necessary_params = ['original_model_path', 'default_label', + list_of_necessary_params = ['model_path', 'default_label', 'pad_label'] # Make sure the necessary parameters are present and valid. for param in parameters: - if param in ['default_label', 'original_model_path', 'pad_label']: + if param in ['default_label', 'model_path', 'pad_label']: if not isinstance(parameters[param], str): error = str(param) + " must be a string." errors.append(error) @@ -426,7 +194,7 @@ def load_from_disk(cls, dirpath): # use f1 score metric custom_objects = { - "F1Score": F1Score( + "F1Score": labeler_utils.F1Score( num_classes=max(label_mapping.values()) + 1, average='micro'), "CharacterLevelCnnModel": cls, @@ -444,11 +212,6 @@ def load_from_disk(cls, dirpath): ] return loaded_model - def _get_layer_index(self, layer_name): - for idx, layer in enumerate(self._model.layers): - if layer.name == layer_name: - return idx - def _construct_model(self): """ Model constructor for the data labeler. 
This also serves as a weight @@ -458,11 +221,12 @@ def _construct_model(self): """ num_labels = self.num_labels default_ind = self.label_mapping[self._parameters['default_label']] - model_loc = self._parameters['original_model_path'] + model_loc = self._parameters['model_path'] self._model = tf.keras.models.load_model(model_loc) softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - softmax_layer_ind = self._get_layer_index(softmax_output_layer_name) + softmax_layer_ind = labeler_utils.get_tf_layer_index_from_name( + self._model, softmax_output_layer_name) softmax_layer = self._model.get_layer(softmax_output_layer_name) prev_softmax_layer = softmax_layer.input @@ -484,7 +248,8 @@ def _construct_model(self): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -536,7 +301,8 @@ def _reconstruct_model(self): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index 4a9e84311..c02fae1b9 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -18,253 +18,7 @@ _file_dir = os.path.dirname(os.path.abspath(__file__)) logger = dp_logging.get_child_logger(__name__) - - -class NoV1ResourceMessageFilter(logging.Filter): - """Removes TF2 warning for using TF1 model which has resources.""" - - def filter(self, record): - msg = ( - "is a problem, consider rebuilding the SavedModel after " - + "running tf.compat.v1.enable_resource_variables()" - ) - return msg not in record.getMessage() - - -tf_logger = logging.getLogger("tensorflow") -tf_logger.addFilter(NoV1ResourceMessageFilter()) - - -def protected_register_keras_serializable(package='Custom', name=None): - """ - Protects against already registered keras serializable layers. This - ensures that if it was already registered, it will not try to register it - again. - """ - def decorator(arg): - """Protects against double registration of a keras layer.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - if tf.keras.utils.get_registered_object(registered_name) is None: - tf.keras.utils.register_keras_serializable(package, name)(arg) - return arg - return decorator - - -@protected_register_keras_serializable() -class FBetaScore(tf.keras.metrics.Metric): - r"""Computes F-Beta score. - Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. 
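# The _construct_model logic above rebuilds a loaded model by cutting off its
# softmax head and attaching one sized to the new label set before
# recompiling. A standalone sketch of that surgery on a toy model (the layer
# sizes and names here are hypothetical, not DataProfiler's):
import tensorflow as tf

def swap_softmax_head(model, num_labels):
    """Return a model whose final softmax layer is resized to num_labels."""
    old_head = model.layers[-1]
    new_output = tf.keras.layers.Dense(
        num_labels, activation="softmax", name="softmax_output")(old_head.input)
    new_model = tf.keras.Model(inputs=model.inputs, outputs=new_output)
    new_model.compile(loss="categorical_crossentropy", optimizer="adam",
                      metrics=["acc"])
    return new_model

toy = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(4,)),
    tf.keras.layers.Dense(8, activation="relu", name="body"),
    tf.keras.layers.Dense(3, activation="softmax", name="old_head"),
])
resized = swap_softmax_head(toy, num_labels=5)
assert resized.output_shape[-1] == 5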
- # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the weighted harmonic mean of precision - and recall. Output range is `[0, 1]`. Works for - both multi-class and multi-label classification. - $$ - F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` and - `weighted`. Default value is None. - beta: Determines the weight of precision and recall - in harmonic mean. Determines the weight given to the - precision and recall. Default value is 1. - threshold: Elements of `y_pred` greater than threshold are - converted to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-Beta Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__( - self, - num_classes, - average=None, - beta=1.0, - threshold=None, - name="fbeta_score", - dtype=None, - **kwargs - ): - super().__init__(name=name, dtype=dtype) - - if average not in (None, "micro", "macro", "weighted"): - raise ValueError( - "Unknown average type. 
Acceptable values " - "are: [None, 'micro', 'macro', 'weighted']" - ) - - if not isinstance(beta, float): - raise TypeError("The value of beta should be a python float") - - if beta <= 0.0: - raise ValueError("beta value should be greater than zero") - - if threshold is not None: - if not isinstance(threshold, float): - raise TypeError("The value of threshold should be a python float") - if threshold > 1.0 or threshold <= 0.0: - raise ValueError("threshold should be between 0 and 1") - - self.num_classes = num_classes - self.average = average - self.beta = beta - self.threshold = threshold - self.axis = None - self.init_shape = [] - - if self.average != "micro": - self.axis = 0 - self.init_shape = [self.num_classes] - - def _zero_wt_init(name): - return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype - ) - - self.true_positives = _zero_wt_init("true_positives") - self.false_positives = _zero_wt_init("false_positives") - self.false_negatives = _zero_wt_init("false_negatives") - self.weights_intermediate = _zero_wt_init("weights_intermediate") - - def update_state(self, y_true, y_pred, sample_weight=None): - if self.threshold is None: - threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) - # make sure [0, 0, 0] doesn't become [1, 1, 1] - # Use abs(x) > eps, instead of x != 0 to check for zero - y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) - else: - y_pred = y_pred > self.threshold - - y_true = tf.cast(y_true, self.dtype) - y_pred = tf.cast(y_pred, self.dtype) - - def _weighted_sum(val, sample_weight): - if sample_weight is not None: - val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) - return tf.reduce_sum(val, axis=self.axis) - - self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) - self.false_positives.assign_add( - _weighted_sum(y_pred * (1 - y_true), sample_weight) - ) - self.false_negatives.assign_add( - _weighted_sum((1 - y_pred) * y_true, sample_weight) - ) - self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) - - def result(self): - precision = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_positives - ) - recall = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_negatives - ) - - mul_value = precision * recall - add_value = (tf.math.square(self.beta) * precision) + recall - mean = tf.math.divide_no_nan(mul_value, add_value) - f1_score = mean * (1 + tf.math.square(self.beta)) - - if self.average == "weighted": - weights = tf.math.divide_no_nan( - self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) - ) - f1_score = tf.reduce_sum(f1_score * weights) - - elif self.average is not None: # [micro, macro] - f1_score = tf.reduce_mean(f1_score) - - return f1_score - - def get_config(self): - """Returns the serializable config of the metric.""" - - config = { - "num_classes": self.num_classes, - "average": self.average, - "beta": self.beta, - "threshold": self.threshold, - } - - base_config = super().get_config() - return {**base_config, **config} - - def reset_states(self): - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - - -@protected_register_keras_serializable() -class F1Score(FBetaScore): - r"""Computes F-1 Score. - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
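# A quick numeric illustration (not from the source) of what the `average`
# argument above changes. With one frequent class and one rare class, macro
# averaging weights both classes equally, while micro averaging pools the raw
# counts, so the frequent class dominates:

def f1(tp, fp, fn):
    precision, recall = tp / (tp + fp), tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

per_class = {"A": (90, 10, 10), "B": (1, 9, 9)}  # (tp, fp, fn) per class

macro = sum(f1(*counts) for counts in per_class.values()) / len(per_class)
micro = f1(*(sum(column) for column in zip(*per_class.values())))

assert round(macro, 3) == 0.5    # (0.9 + 0.1) / 2
assert round(micro, 3) == 0.827  # pooled counts: tp=91, fp=19, fn=19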
- # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the harmonic mean of precision and recall. - Output range is `[0, 1]`. Works for both multi-class - and multi-label classification. - $$ - F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` - and `weighted`. Default value is None. - threshold: Elements of `y_pred` above threshold are - considered to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-1 Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__( - self, num_classes, average=None, threshold=None, name="f1_score", dtype=None - ): - super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) - - def get_config(self): - base_config = super().get_config() - del base_config["beta"] - return base_config +labeler_utils.hide_tf_logger_warnings() def build_embd_dictionary(filename): @@ -539,9 +293,9 @@ def load_from_disk(cls, dirpath): # use f1 score metric custom_objects = { - "F1Score": F1Score( - num_classes=max(label_mapping.values()) + 1, average="micro" - ), + "F1Score": labeler_utils.F1Score( + num_classes=max(label_mapping.values()) + 1, + average='micro'), "CharacterLevelCnnModel": cls, } with tf.keras.utils.custom_object_scope(custom_objects): @@ -761,8 +515,9 @@ def encoding_function(input_str): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average="micro") - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -821,8 +576,9 @@ def _reconstruct_model(self): losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average="micro") - metrics = {softmax_output_layer_name: ["acc", f1_score_training]} + f1_score_training = labeler_utils.F1Score( + num_classes=num_labels, average='micro') + metrics = {softmax_output_layer_name: ['acc', f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) self._epoch_id = 0 diff --git a/dataprofiler/labelers/labeler_utils.py b/dataprofiler/labelers/labeler_utils.py index 4bf92fb02..679233e39 100644 --- a/dataprofiler/labelers/labeler_utils.py +++ b/dataprofiler/labelers/labeler_utils.py @@ -1,9 +1,11 @@ 
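# hide_tf_logger_warnings(), added to labeler_utils below, silences one known
# TensorFlow message by attaching a logging.Filter to the 'tensorflow'
# logger. The mechanism in isolation (generic sketch, not DataProfiler code):
import logging

class DropSubstringFilter(logging.Filter):
    """Drop any record whose formatted message contains `needle`."""

    def __init__(self, needle):
        super().__init__()
        self.needle = needle

    def filter(self, record):
        # Returning False suppresses the record.
        return self.needle not in record.getMessage()

demo_logger = logging.getLogger("demo")
demo_logger.addFilter(DropSubstringFilter("rebuilding the SavedModel"))
demo_logger.warning("consider rebuilding the SavedModel ...")  # suppressed
demo_logger.warning("an unrelated warning")                    # passes through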
import os import warnings +import logging import numpy as np import scipy from sklearn.exceptions import UndefinedMetricWarning +import tensorflow as tf from .. import dp_logging from .classification_report_utils import classification_report @@ -196,3 +198,255 @@ def evaluate_accuracy( logger.info(f"F1 Score: {f1}") return f1, f1_report + + +def get_tf_layer_index_from_name(model, layer_name): + """ + Returns the index of the layer given the layer name within a tf model + + :param model: tf keras model to search + :param layer_name: name of the layer to find + :return: layer index if it exists or None + """ + for idx, layer in enumerate(model.layers): + if layer.name == layer_name: + return idx + + +def hide_tf_logger_warnings(): + """ + Filters out a set of warnings from the tf logger. + """ + class NoV1ResourceMessageFilter(logging.Filter): + """Removes TF2 warning for using TF1 model which has resources.""" + def filter(self, record): + msg = 'is a problem, consider rebuilding the SavedModel after ' + \ + 'running tf.compat.v1.enable_resource_variables()' + return msg not in record.getMessage() + + + tf_logger = logging.getLogger('tensorflow') + tf_logger.addFilter(NoV1ResourceMessageFilter()) + + +def protected_register_keras_serializable(package='Custom', name=None): + """ + Protects against already registered keras serializable layers. This + ensures that if it was already registered, it will not try to register it + again. + """ + def decorator(arg): + """Protects against double registration of a keras layer.""" + class_name = name if name is not None else arg.__name__ + registered_name = package + '>' + class_name + if tf.keras.utils.get_registered_object(registered_name) is None: + tf.keras.utils.register_keras_serializable(package, name)(arg) + return arg + return decorator + + +@protected_register_keras_serializable() +class FBetaScore(tf.keras.metrics.Metric): + r"""Computes F-Beta score. + Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================== + + It is the weighted harmonic mean of precision + and recall. Output range is `[0, 1]`. Works for + both multi-class and multi-label classification. + $$ + F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} + $$ + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` and + `weighted`. Default value is None. + beta: Determines the weight of precision and recall + in harmonic mean. Determines the weight given to the + precision and recall. Default value is 1. 
+ threshold: Elements of `y_pred` greater than threshold are + converted to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + Returns: + F-Beta Score: float. + """ + + # Modification: remove the run-time type checking for functions + def __init__(self, num_classes, average=None, beta=1.0, threshold=None, + name="fbeta_score", dtype=None, **kwargs): + super().__init__(name=name, dtype=dtype) + + if average not in (None, "micro", "macro", "weighted"): + raise ValueError( + "Unknown average type. Acceptable values " + "are: [None, 'micro', 'macro', 'weighted']" + ) + + if not isinstance(beta, float): + raise TypeError("The value of beta should be a python float") + + if beta <= 0.0: + raise ValueError("beta value should be greater than zero") + + if threshold is not None: + if not isinstance(threshold, float): + raise TypeError("The value of threshold should be a python float") + if threshold > 1.0 or threshold <= 0.0: + raise ValueError("threshold should be between 0 and 1") + + self.num_classes = num_classes + self.average = average + self.beta = beta + self.threshold = threshold + self.axis = None + self.init_shape = [] + + if self.average != "micro": + self.axis = 0 + self.init_shape = [self.num_classes] + + def _zero_wt_init(name): + return self.add_weight( + name, shape=self.init_shape, initializer="zeros", dtype=self.dtype + ) + + self.true_positives = _zero_wt_init("true_positives") + self.false_positives = _zero_wt_init("false_positives") + self.false_negatives = _zero_wt_init("false_negatives") + self.weights_intermediate = _zero_wt_init("weights_intermediate") + + def update_state(self, y_true, y_pred, sample_weight=None): + if self.threshold is None: + threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) + # make sure [0, 0, 0] doesn't become [1, 1, 1] + # Use abs(x) > eps, instead of x != 0 to check for zero + y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) + else: + y_pred = y_pred > self.threshold + + y_true = tf.cast(y_true, self.dtype) + y_pred = tf.cast(y_pred, self.dtype) + + def _weighted_sum(val, sample_weight): + if sample_weight is not None: + val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) + return tf.reduce_sum(val, axis=self.axis) + + self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) + self.false_positives.assign_add( + _weighted_sum(y_pred * (1 - y_true), sample_weight) + ) + self.false_negatives.assign_add( + _weighted_sum((1 - y_pred) * y_true, sample_weight) + ) + self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) + + def result(self): + precision = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_positives + ) + recall = tf.math.divide_no_nan( + self.true_positives, self.true_positives + self.false_negatives + ) + + mul_value = precision * recall + add_value = (tf.math.square(self.beta) * precision) + recall + mean = tf.math.divide_no_nan(mul_value, add_value) + f1_score = mean * (1 + tf.math.square(self.beta)) + + if self.average == "weighted": + weights = tf.math.divide_no_nan( + self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) + ) + f1_score = tf.reduce_sum(f1_score * weights) + + elif self.average is not None: # [micro, macro] + f1_score = tf.reduce_mean(f1_score) + + return f1_score + + def get_config(self): + """Returns the serializable config of the metric.""" + + 
config = { + "num_classes": self.num_classes, + "average": self.average, + "beta": self.beta, + "threshold": self.threshold, + } + + base_config = super().get_config() + return {**base_config, **config} + + def reset_states(self): + reset_value = tf.zeros(self.init_shape, dtype=self.dtype) + tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) + + +@protected_register_keras_serializable() +class F1Score(FBetaScore): + r"""Computes F-1 Score. + + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. + # ============================================================================== + + It is the harmonic mean of precision and recall. + Output range is `[0, 1]`. Works for both multi-class + and multi-label classification. + $$ + F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} + $$ + Args: + num_classes: Number of unique classes in the dataset. + average: Type of averaging to be performed on data. + Acceptable values are `None`, `micro`, `macro` + and `weighted`. Default value is None. + threshold: Elements of `y_pred` above threshold are + considered to be 1, and the rest 0. If threshold is + None, the argmax is converted to 1, and the rest 0. + name: (Optional) String name of the metric instance. + dtype: (Optional) Data type of the metric result. + Returns: + F-1 Score: float. 
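# Why the protected_register_keras_serializable guard above is useful: the
# keras registry is process-global, so code paths that each try to register
# the same "package>name" (for example via repeated imports or module
# reloads) could collide. The guard makes registration idempotent. A minimal
# sketch of the same idea with a hypothetical metric (illustrative only):
import tensorflow as tf

def idempotent_register(package="Custom", name=None):
    def decorator(cls):
        registered_name = package + ">" + (name if name is not None else cls.__name__)
        if tf.keras.utils.get_registered_object(registered_name) is None:
            tf.keras.utils.register_keras_serializable(package, name)(cls)
        return cls
    return decorator

@idempotent_register()
@idempotent_register()  # second application is a no-op, not a re-registration
class DemoMetric(tf.keras.metrics.Mean):
    pass

assert tf.keras.utils.get_registered_object("Custom>DemoMetric") is DemoMetric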
+ """ + + # Modification: remove the run-time type checking for functions + def __init__(self, num_classes, average=None, threshold=None, + name="f1_score", dtype=None): + super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) + + def get_config(self): + base_config = super().get_config() + del base_config["beta"] + return base_config From 2066271490cb7bf03af1f67402ce6bd0a05c6913 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 5 Jul 2022 13:54:06 -0500 Subject: [PATCH 04/14] fix: add util tests --- .../tests/labelers/test_labeler_utils.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index 0b999c41d..b352ebd9e 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -1,8 +1,10 @@ import unittest from unittest import mock +import logging import numpy as np import pandas as pd +import tensorflow as tf from dataprofiler.labelers import labeler_utils @@ -267,3 +269,33 @@ def test_save_conf_mat(self, mock_dataframe): self.assertDictEqual(expected_row_col_names, mock_dataframe.call_args[1]) mock_instance_df.to_csv.assert_called() + + +class TestTFFunctions(unittest.TestCase): + + def test_get_tf_layer_index_from_name(self): + model = tf.keras.Sequential() + model.add(tf.keras.Input((1, 2), name='input')) + model.add(tf.keras.layers.Dense(units=4, name='dense0')) + model.add(tf.keras.layers.Dense(units=3, name='dense1')) + + ind = labeler_utils.get_tf_layer_index_from_name(model, 'not a layer') + self.assertIsNone(ind) + + # input is not counted in the layer + ind = labeler_utils.get_tf_layer_index_from_name(model, 'input') + self.assertIsNone(ind) + + ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense1') + self.assertEqual(1, ind) + + ind = labeler_utils.get_tf_layer_index_from_name(model, 'dense0') + self.assertEqual(0, ind) + + def test_hide_tf_logger_warnings(self): + logger = logging.getLogger('tensorflow') + self.assertListEqual([], logger.filters) + + # make change and validate updated filter + labeler_utils.hide_tf_logger_warnings() + self.assertEqual(1, len(logger.filters)) From c108d705811fe70d29e069ef55cfb5a7bbe2d9b9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 5 Jul 2022 14:50:11 -0500 Subject: [PATCH 05/14] fix: remove comments --- dataprofiler/labelers/char_load_tf_model.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 568b335f7..5e1bd6701 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -460,13 +460,7 @@ def predict(self, data, batch_size=32, show_confidences=False, "predict.") # Pre-allocate space for predictions confidences = [] - # sentence_lengths = np.zeros((batch_size,), dtype=int) - # predictions = np.zeros((batch_size, self._parameters['max_length'])) predictions = [] - # if show_confidences: - # confidences = np.zeros((batch_size, - # self._parameters['max_length'], - # self.num_labels)) # Run model with batching allocation_index = 0 @@ -482,16 +476,12 @@ def predict(self, data, batch_size=32, show_confidences=False, # Double array size if len(predictions) <= allocation_index: predictions += predictions - # sentence_lengths = np.pad( - # sentence_lengths, pad_width=((0, len(sentence_lengths)),), - # mode='constant') if show_confidences: confidences += 
confidences if show_confidences: confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() - # sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) allocation_index += num_samples_in_batch @@ -502,13 +492,6 @@ def predict(self, data, batch_size=32, show_confidences=False, confidences = [confidences[i].tolist() for i in range(0, allocation_index)] - # # Append slices of predictions to return prediction & confidence matrices - # for index, sentence_length \ - # in enumerate(sentence_lengths[:allocation_index]): - # predictions_list[index] = list(predictions[index][:sentence_length]) - # if show_confidences: - # confidences_list[index] = list(confidences[index][:sentence_length]) - if show_confidences: return {'pred': predictions, 'conf': confidences} return {'pred': predictions} From 84c3d9c0614a30f1c1857aacd810cd99e6d33320 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 11:37:17 -0500 Subject: [PATCH 06/14] feat: add test for data loading --- .../test_char_load_tf_data_labeler.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py diff --git a/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py new file mode 100644 index 000000000..2f8237db5 --- /dev/null +++ b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py @@ -0,0 +1,148 @@ +import json +import os +import unittest +from io import StringIO +from unittest import mock + +from dataprofiler.labelers import DataLabeler, UnstructuredDataLabeler, \ + data_processing +from dataprofiler.labelers.char_load_tf_model import \ + CharLoadTFModel + + +test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + +data_labeler_parameters = { + 'model': { + 'class': 'CharLoadTFModel', + 'parameters': {} + }, + 'label_mapping': { + 'PAD': 0, + 'CITY': 1, # SAME AS UNKNOWN + 'UNKNOWN': 1, + 'ADDRESS': 2, + 'PERSON': 3, + }, + 'preprocessor': { + 'class': 'CharEncodedPreprocessor' + }, + 'postprocessor': { + 'class': 'CharPostprocessor' + }, +} + +preprocessor_parameters = { + 'encoding_map': {'t': 1, 's': 2}, + 'flatten_split': 0, + 'flatten_separator': ' ', + 'is_separate_at_max_len': True, + +} + +postprocessor_parameters = { + 'use_word_level_argmax': True, + 'output_format': 'character_argmax', + 'separators': (' ', ',', ';', "'", '"', ':', '\n', '\t', "."), + 'word_level_min_percent': 0.75, +} + + +def mock_open(filename, *args): + if filename.find('data_labeler_parameters') >= 0: + return StringIO(json.dumps(data_labeler_parameters)) + elif filename.find('preprocessor_parameters') >= 0: + return StringIO(json.dumps(preprocessor_parameters)) + elif filename.find('postprocessor_parameters') >= 0: + return StringIO(json.dumps(postprocessor_parameters)) + + +def setup_save_mock_open(mock_open): + mock_file = StringIO() + mock_file.close = lambda: None + mock_open.side_effect = lambda *args: mock_file + return mock_file + + +@mock.patch('dataprofiler.labelers.data_processing.BaseDataProcessor') +@mock.patch('dataprofiler.labelers.char_load_tf_model.' 
+ 'CharLoadTFModel.load_from_disk') +@mock.patch("builtins.open", side_effect=mock_open) +class TestCharTFLoadDataLabeler(unittest.TestCase): + + @staticmethod + def _setup_mock_load_model(mock_load_model): + model_mock = mock.Mock(spec=CharLoadTFModel) + model_mock.set_num_labels = mock.Mock() + mock_load_model.return_value = model_mock + model_mock.requires_zero_mapping = True + model_mock.labels = ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON'] + model_mock.label_mapping = { + 'PAD': 0, + 'CITY': 1, # SAME AS UNKNOWN + 'UNKNOWN': 1, + 'ADDRESS': 2, + 'PERSON': 3, + } + model_mock.reverse_label_mapping = { + 0: 'PAD', + 1: 'UNKNOWN', + 2: 'ADDRESS', + 3: 'PERSON', + } + + @staticmethod + def _setup_mock_load_processor(mock_base_processor): + def side_effect(arg): + processor = { + 'CharEncodedPreprocessor': mock.Mock( + spec=data_processing.CharEncodedPreprocessor), + 'CharPostprocessor': mock.Mock( + spec=data_processing.CharPostprocessor), + }[arg] + processor.load_from_disk.return_value = processor + return processor + + mock_base_processor.get_class.side_effect = side_effect + + def test_load_from_disk(self, mock_open, mock_load_model, + mock_base_processor): + + self._setup_mock_load_model(mock_load_model) + self._setup_mock_load_processor(mock_base_processor) + + # load default + data_labeler = DataLabeler.load_from_disk('fake/path') + + self.assertDictEqual(data_labeler.label_mapping, + data_labeler_parameters['label_mapping']) + self.assertListEqual( + data_labeler.labels, + ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON']) + self.assertIsInstance( + data_labeler.preprocessor, data_processing.BaseDataPreprocessor) + self.assertIsInstance( + data_labeler.postprocessor, data_processing.BaseDataPostprocessor) + + def test_save_to_disk(self, mock_open, mock_load_model, + mock_load_processor, *mocks): + + self._setup_mock_load_model(mock_load_model) + self._setup_mock_load_processor(mock_load_processor) + + # call func + data_labeler = UnstructuredDataLabeler() + + # setup save mock + mock_file = setup_save_mock_open(mock_open) + + # save and test + data_labeler.save_to_disk('test/path') + self.assertEqual( + '{"model": {"class": "CharLoadTFModel"}, ' + '"preprocessor": {"class": "CharEncodedPreprocessor"}, ' + '"postprocessor": {"class": "CharPostprocessor"}}', + mock_file.getvalue()) + + # close mock + StringIO.close(mock_file) From c60e678bc31c591b6c4d5596c5ec1cd86c7d9e06 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:04:38 -0500 Subject: [PATCH 07/14] fix: imports --- dataprofiler/labelers/char_load_tf_model.py | 3 --- dataprofiler/labelers/character_level_cnn_model.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 5e1bd6701..0a1f40454 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -3,13 +3,10 @@ import os import sys import time -import logging from collections import defaultdict -import functools import tensorflow as tf import numpy as np -from sklearn import decomposition from . 
import labeler_utils from .base_model import BaseModel, BaseTrainableModel diff --git a/dataprofiler/labelers/character_level_cnn_model.py b/dataprofiler/labelers/character_level_cnn_model.py index c02fae1b9..3c5e50957 100644 --- a/dataprofiler/labelers/character_level_cnn_model.py +++ b/dataprofiler/labelers/character_level_cnn_model.py @@ -1,11 +1,9 @@ import copy import json -import logging import os import sys import time from collections import defaultdict -import functools import numpy as np import tensorflow as tf From d95e37537afd54d02ac80cb98caf5741a7fd7e4f Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:05:20 -0500 Subject: [PATCH 08/14] feat: add tests --- dataprofiler/labelers/char_load_tf_model.py | 19 +- .../tests/labelers/test_char_tf_load_model.py | 373 ++++++++++++++++++ 2 files changed, 384 insertions(+), 8 deletions(-) create mode 100644 dataprofiler/tests/labelers/test_char_tf_load_model.py diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 0a1f40454..5abc1fa96 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -162,12 +162,14 @@ def save_to_disk(self, dirpath): self._reconstruct_model() model_param_dirpath = os.path.join(dirpath, "model_parameters.json") + model_parameters = self._parameters.copy() + model_parameters.pop("model_path") with open(model_param_dirpath, 'w') as fp: - json.dump(self._parameters, fp) + json.dump(model_parameters, fp) labels_dirpath = os.path.join(dirpath, "label_mapping.json") with open(labels_dirpath, 'w') as fp: json.dump(self.label_mapping, fp) - self._model.save(os.path.join(dirpath)) + self._model.save(dirpath) @classmethod def load_from_disk(cls, dirpath): @@ -194,12 +196,12 @@ def load_from_disk(cls, dirpath): "F1Score": labeler_utils.F1Score( num_classes=max(label_mapping.values()) + 1, average='micro'), - "CharacterLevelCnnModel": cls, + "CharLoadTFModel": cls, } with tf.keras.utils.custom_object_scope(custom_objects): tf_model = tf.keras.models.load_model(dirpath) - loaded_model = cls(label_mapping, parameters) + loaded_model = cls(dirpath, label_mapping, parameters) loaded_model._model = tf_model # load self @@ -448,8 +450,7 @@ def predict(self, data, batch_size=32, show_confidences=False, :rtype: dict """ if not self._model: - raise ValueError("You are trying to predict without a model. " - "Construct/Load a model before predicting.") + self._construct_model() elif self._need_to_reconstruct_model(): raise RuntimeError("The model label mapping definitions have been " "altered without additional training. 
Please " @@ -468,7 +469,6 @@ def predict(self, data, batch_size=32, show_confidences=False, # Count number of samples in batch to prevent array mismatch num_samples_in_batch = len(batch_data) - allocation_index = batch_id * batch_size # Double array size if len(predictions) <= allocation_index: @@ -483,7 +483,7 @@ def predict(self, data, batch_size=32, show_confidences=False, allocation_index += num_samples_in_batch # Convert predictions, confidences to lists from numpy - predictions = [predictions[i].tolist() for i in range(0, allocation_index)] + predictions = [predictions[i].tolist() for i in range(allocation_index)] confidences_list = None if show_confidences: confidences = [confidences[i].tolist() @@ -498,6 +498,9 @@ def details(self): Prints the relevant details of the model (summary, parameters, label mapping) """ + if not self._model: + self._construct_model() + print("\n###### Model Details ######\n") self._model.summary() print("\nModel Parameters:") diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py new file mode 100644 index 000000000..8f06af6f2 --- /dev/null +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -0,0 +1,373 @@ +import json +import os +import unittest +from io import StringIO +from unittest import mock + +import numpy as np +import pandas as pd +import pkg_resources +import tensorflow as tf + +from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel + +_file_dir = os.path.dirname(os.path.abspath(__file__)) +_resource_labeler_dir = pkg_resources.resource_filename("resources", "labelers") + + +mock_model_parameters = { + "model_path": "project/example/path/fake_model.h5", + "default_label": "UNKNOWN", +} + + +mock_label_mapping = { + "PAD": 0, + "CITY": 1, # ensure that overlapping labels get removed. + "UNKNOWN": 1, + "ADDRESS": 2, +} + + +def mock_tf_model(*args, **kwargs): + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.int64)) + model.add(tf.keras.layers.Embedding( + input_dim=100, + output_dim=30, + embeddings_initializer="normal", + trainable=True)) + model.add(tf.keras.layers.Dense(units=10, activation="relu")) + model.add(tf.keras.layers.Dense(10, activation="softmax")) + return model + + +def mock_open(filename, *args): + if filename.find("model_parameters") >= 0: + return StringIO(json.dumps(mock_model_parameters)) + elif filename.find("label_mapping") >= 0: + return StringIO(json.dumps(mock_label_mapping)) + + +def setup_save_mock_open(mock_open): + mock_file = StringIO() + mock_file.close = lambda: None + mock_open.side_effect = lambda *args: mock_file + return mock_file + + +@mock.patch('tensorflow.keras.models.load_model', side_effect=mock_tf_model) +class TestCharLoadTFModel(unittest.TestCase): + @classmethod + def setUpClass(cls): + # data + cls.df = pd.DataFrame( + { + 0: [ + "MUCH xerophytic GOOFPROOF. Ranch Declarerevise health WITH " + "zinc Rhizoctinia.INCULCATION suntrapMordacity `GUAN... " + "NECROMANTIC` HAVE mastopathy_nonfeasance_DEMOCRAT 26/09/95 " + "18:16 HE sugarcoat [8eec39e5-8acc-40ca-b424-7171ac49131b] " + "ourselves" + ], + 1: [[[164, 178, "DATETIME"], [193, 229, "UUID"]]], + } + ) + cls.label_mapping = { + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN, ensure that overlapping + "UNKNOWN": 1, # labels get removed. 
+ "ADDRESS": 2, + "BAN": 3, + "CREDIT_CARD": 4, + "EMAIL_ADDRESS": 5, + "UUID": 6, + "HASH_OR_KEY": 7, + "IPV4": 8, + "IPV6": 9, + "MAC_ADDRESS": 10, + "NAME": 11, # SAME AS PERSON + "PERSON": 11, + "PHONE_NUMBER": 12, + "SSN": 13, + "URL": 14, + "DATETIME": 15, + "INTEGER_BIG": 16, # SAME AS INTEGER + } + cls.model_path = "project/example/path/fake_model.h5" + + def test_init(self, *mocks): + + # load default + model = CharLoadTFModel(self.model_path, self.label_mapping) + expected_labels = [ + "PAD", + "UNKNOWN", + "ADDRESS", + "BAN", + "CREDIT_CARD", + "EMAIL_ADDRESS", + "UUID", + "HASH_OR_KEY", + "IPV4", + "IPV6", + "MAC_ADDRESS", + "PERSON", + "PHONE_NUMBER", + "SSN", + "URL", + "DATETIME", + "INTEGER_BIG", + ] + + self.assertDictEqual(self.label_mapping, model.label_mapping) + self.assertEqual(self.model_path, model._parameters['model_path']) + self.assertListEqual(expected_labels, model.labels) + + def test_reverse_label_mapping(self, *mocks): + + # load default + model = CharLoadTFModel(self.model_path, self.label_mapping) + + # should notice that CITY does not exist in reverse + expected_reverse_label_mapping = { + 0: "PAD", + 1: "UNKNOWN", + 2: "ADDRESS", + 3: "BAN", + 4: "CREDIT_CARD", + 5: "EMAIL_ADDRESS", + 6: "UUID", + 7: "HASH_OR_KEY", + 8: "IPV4", + 9: "IPV6", + 10: "MAC_ADDRESS", + 11: "PERSON", + 12: "PHONE_NUMBER", + 13: "SSN", + 14: "URL", + 15: "DATETIME", + 16: "INTEGER_BIG", + } + + self.assertDictEqual( + expected_reverse_label_mapping, + model.reverse_label_mapping + ) + + def test_set_label_mapping(self, *mocks): + + # load default + model = CharLoadTFModel(self.model_path, self.label_mapping) + + # test not dict + label_mapping = None + with self.assertRaisesRegex( + TypeError, + "Labels must either be a non-empty encoding dict " + "which maps labels to index encodings or a list.", + ): + model.set_label_mapping(label_mapping) + + # test label_mapping without PAD + label_mapping = { + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + } + model.set_label_mapping(label_mapping) + label_mapping["PAD"] = 0 + self.assertDictEqual(label_mapping, model.label_mapping) + + # test list without pad sets PAD: 0 + labels = [ + "UNKNOWN", + "ADDRESS", + ] + label_mapping = { + "PAD": 1, + "UNKNOWN": 2, + "ADDRESS": 3, + } + model.set_label_mapping(labels) + self.assertDictEqual(label_mapping, model.label_mapping) + + # test label_mapping with PAD: 0 + label_mapping = { + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + } + model.set_label_mapping(label_mapping) + self.assertDictEqual(label_mapping, model.label_mapping) + + # test if pad not set, but 0 taken set to last ind + # test label_mapping without PAD + label_mapping = { + "CITY": 0, + "UNKNOWN": 1, + "ADDRESS": 2, + } + model.set_label_mapping(label_mapping) + label_mapping["PAD"] = 3 + self.assertDictEqual(label_mapping, model.label_mapping) + + def test_predict(self, *mocks): + # model + model = CharLoadTFModel(self.model_path, self.label_mapping) + data_gen = [np.array([[1, 3], [1, 2]])] + result = model.predict(data_gen) + self.assertIn("pred", result) + self.assertEqual((2, 2), np.array(result['pred']).shape) + + result = model.predict(data_gen, show_confidences=True) + self.assertIn("pred", result) + self.assertIn("conf", result) + self.assertEqual( + (2, 2, model.num_labels), + np.array(result['conf']).shape + ) + + def test_fit_and_predict(self, *mocks): + # model + model = CharLoadTFModel(self.model_path, self.label_mapping) + + # data for model + data_gen = [ + [ + 
np.array([[1, 3], [1, 2]]), # x_data + np.zeros((2, 2, model.num_labels)), # y_data + ] + ] + cv_gen = data_gen + + # Basic Fit with Validation Data + with self.assertLogs( + "DataProfiler.labelers.char_load_tf_model", level="INFO" + ) as logs: + history, f1, f1_report = model.fit(data_gen, cv_gen, reset_weights=True) + + # Ensure info was logged during fit + self.assertTrue(len(logs.output)) + + data_gen = [np.array([[1, 3], [1, 2]])] + model.predict(data_gen) + + # fit with new labels + new_label_mapping = { + "PAD": 0, + "TEST": 1, + "NEW": 2, + "MAPPING": 3, + model._parameters['default_label']: 4, + } + data_gen = [ + [ + np.array([[1, 3], [1, 2]]), # x_data + np.zeros((2, 2, len(new_label_mapping))), # y_data + ] + ] + history, f1, f1_report = model.fit( + data_gen, cv_gen, label_mapping=new_label_mapping + ) + + # predict after fitting on just the text + model.predict(data_gen[0][0]) + + @mock.patch("os.makedirs", return_value=None) + def test_validation_evaluate_and_classification_report(self, *mocks): + model = CharLoadTFModel(self.model_path, self.label_mapping) + model._construct_model() # must make model to do priv validate func + + # validation data + val_gen = [ + [ + np.ones((2, 20)), # x_data + np.zeros((2, 20, model.num_labels)), # y_data + ] + ] + val_gen[0][1][0, :11, self.label_mapping["ADDRESS"]] = 1 + + f1, f1_report = model._validate_training(val_gen, 32, True, True) + self.assertIsNotNone(f1) + self.assertIsNotNone(f1_report) + self.assertEqual(11, f1_report["ADDRESS"]["support"]) + + def test_param_validation(self, *mocks): + # Make sure all parameters can be altered. Make sure non-valid params + # are caught + parameters = { + "default_label": "UNKNOWN", + } + invalid_parameters = { + "fake_extra_param": "fails", + } + model = CharLoadTFModel( + self.model_path, + label_mapping=self.label_mapping, + parameters=parameters + ) + model._construct_model() + self.assertDictEqual(parameters, model._parameters) + with self.assertRaises(ValueError): + CharLoadTFModel( + self.model_path, + label_mapping=self.label_mapping, + parameters=invalid_parameters + ) + + @mock.patch("sys.stdout", new_callable=StringIO) + def test_help(self, mock_stdout, *mocks): + CharLoadTFModel.help() + self.assertIn("CharLoadTFModel", mock_stdout.getvalue()) + self.assertIn("Parameters", mock_stdout.getvalue()) + + @mock.patch("tensorflow.keras.Model.save", return_value=None) + @mock.patch("builtins.open") + def test_save(self, mock_open, mock_tf_save, *mocks): + # setup mock + mock_file = setup_save_mock_open(mock_open) + + # Save and load a CNN Model with custom parameters + parameters = {} + label_mapping = mock_label_mapping + model = CharLoadTFModel(self.model_path, label_mapping, parameters) + + # save file and test + save_path = "./fake/path" + model.save_to_disk(save_path) + self.assertEqual( + # model parameters + '{"default_label": "UNKNOWN", "pad_label": "PAD"}' + # label_mapping + '{"PAD": 0, "CITY": 1, "UNKNOWN": 1, "ADDRESS": 2}', + mock_file.getvalue(), + ) + mock_tf_save.assert_called_with(save_path) + + # close mock + StringIO.close(mock_file) + + @mock.patch("tensorflow.keras.Model.save", return_value=None) + @mock.patch("builtins.open", side_effect=mock_open) + def test_load(self, *mocks): + dir = "fake/path/" + loaded_model = CharLoadTFModel.load_from_disk(dir) + self.assertIsInstance(loaded_model, CharLoadTFModel) + + @mock.patch("sys.stdout", new_callable=StringIO) + def test_model_details(self, mock_stdout, *mocks): + # Default Model Construct + model = 
CharLoadTFModel(self.model_path, self.label_mapping) + + # Test Details + model.details() + self.assertIn("input", mock_stdout.getvalue()) + self.assertIn("dense", mock_stdout.getvalue()) + self.assertIn("softmax_output", mock_stdout.getvalue()) + self.assertIn("Total params", mock_stdout.getvalue()) + + +if __name__ == "__main__": + unittest.main() From 4f2fd20d941a90387650227b4b05b0a463a873d7 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:08:19 -0500 Subject: [PATCH 09/14] fix: remove unneeded file --- .../labelers/pre_encoded_char_cnn_model.py | 839 ------------------ 1 file changed, 839 deletions(-) delete mode 100644 dataprofiler/labelers/pre_encoded_char_cnn_model.py diff --git a/dataprofiler/labelers/pre_encoded_char_cnn_model.py b/dataprofiler/labelers/pre_encoded_char_cnn_model.py deleted file mode 100644 index a8ecb96de..000000000 --- a/dataprofiler/labelers/pre_encoded_char_cnn_model.py +++ /dev/null @@ -1,839 +0,0 @@ -import json -import copy -import os -import sys -import time -import logging -from collections import defaultdict -import functools - -import tensorflow as tf -import numpy as np -from sklearn import decomposition - -from . import labeler_utils -from .base_model import BaseModel, BaseTrainableModel -from .base_model import AutoSubRegistrationMeta -from .. import dp_logging - -_file_dir = os.path.dirname(os.path.abspath(__file__)) - -logger = dp_logging.get_child_logger(__name__) - - -class NoV1ResourceMessageFilter(logging.Filter): - """Removes TF2 warning for using TF1 model which has resources.""" - def filter(self, record): - msg = 'is a problem, consider rebuilding the SavedModel after ' + \ - 'running tf.compat.v1.enable_resource_variables()' - return msg not in record.getMessage() - - -tf_logger = logging.getLogger('tensorflow') -tf_logger.addFilter(NoV1ResourceMessageFilter()) - - -def protected_register_keras_serializable(package='Custom', name=None): - """ - Protects against already registered keras serializable layers. This - ensures that if it was already registered, it will not try to register it - again. - """ - def decorator(arg): - """Protects against double registration of a keras layer.""" - class_name = name if name is not None else arg.__name__ - registered_name = package + '>' + class_name - if tf.keras.utils.get_registered_object(registered_name) is None: - tf.keras.utils.register_keras_serializable(package, name)(arg) - return arg - return decorator - - -@protected_register_keras_serializable() -class FBetaScore(tf.keras.metrics.Metric): - r"""Computes F-Beta score. - Adapted and slightly modified from https://github.com/tensorflow/addons/blob/v0.12.0/tensorflow_addons/metrics/f_scores.py#L211-L283 - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. 
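# The save/load tests above mock TF; the real load_from_disk relies on
# tf.keras.utils.custom_object_scope so the persisted model's custom F1Score
# metric can be deserialized. A runnable round-trip of that pattern with a
# stand-in custom metric (assuming TF 2.x-era SavedModel APIs, as this PR uses):
import tempfile

import tensorflow as tf

class DemoMean(tf.keras.metrics.Mean):
    """Hypothetical custom metric standing in for labeler_utils.F1Score."""

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(2,)),
    tf.keras.layers.Dense(3, activation="softmax"),
])
model.compile(loss="categorical_crossentropy", optimizer="adam",
              metrics=[DemoMean(name="demo_mean")])

with tempfile.TemporaryDirectory() as dirpath:
    model.save(dirpath)  # SavedModel directory, like CharLoadTFModel.save_to_disk
    with tf.keras.utils.custom_object_scope({"DemoMean": DemoMean}):
        reloaded = tf.keras.models.load_model(dirpath)

assert reloaded.output_shape == model.output_shape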
- # ============================================================================== - - It is the weighted harmonic mean of precision - and recall. Output range is `[0, 1]`. Works for - both multi-class and multi-label classification. - $$ - F_{\beta} = (1 + \beta^2) * \frac{\textrm{precision} * \textrm{precision}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` and - `weighted`. Default value is None. - beta: Determines the weight of precision and recall - in harmonic mean. Determines the weight given to the - precision and recall. Default value is 1. - threshold: Elements of `y_pred` greater than threshold are - converted to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-Beta Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, beta=1.0, threshold=None, - name="fbeta_score", dtype=None, **kwargs): - super().__init__(name=name, dtype=dtype) - - if average not in (None, "micro", "macro", "weighted"): - raise ValueError( - "Unknown average type. Acceptable values " - "are: [None, 'micro', 'macro', 'weighted']" - ) - - if not isinstance(beta, float): - raise TypeError("The value of beta should be a python float") - - if beta <= 0.0: - raise ValueError("beta value should be greater than zero") - - if threshold is not None: - if not isinstance(threshold, float): - raise TypeError("The value of threshold should be a python float") - if threshold > 1.0 or threshold <= 0.0: - raise ValueError("threshold should be between 0 and 1") - - self.num_classes = num_classes - self.average = average - self.beta = beta - self.threshold = threshold - self.axis = None - self.init_shape = [] - - if self.average != "micro": - self.axis = 0 - self.init_shape = [self.num_classes] - - def _zero_wt_init(name): - return self.add_weight( - name, shape=self.init_shape, initializer="zeros", dtype=self.dtype - ) - - self.true_positives = _zero_wt_init("true_positives") - self.false_positives = _zero_wt_init("false_positives") - self.false_negatives = _zero_wt_init("false_negatives") - self.weights_intermediate = _zero_wt_init("weights_intermediate") - - def update_state(self, y_true, y_pred, sample_weight=None): - if self.threshold is None: - threshold = tf.reduce_max(y_pred, axis=-1, keepdims=True) - # make sure [0, 0, 0] doesn't become [1, 1, 1] - # Use abs(x) > eps, instead of x != 0 to check for zero - y_pred = tf.logical_and(y_pred >= threshold, tf.abs(y_pred) > 1e-12) - else: - y_pred = y_pred > self.threshold - - y_true = tf.cast(y_true, self.dtype) - y_pred = tf.cast(y_pred, self.dtype) - - def _weighted_sum(val, sample_weight): - if sample_weight is not None: - val = tf.math.multiply(val, tf.expand_dims(sample_weight, 1)) - return tf.reduce_sum(val, axis=self.axis) - - self.true_positives.assign_add(_weighted_sum(y_pred * y_true, sample_weight)) - self.false_positives.assign_add( - _weighted_sum(y_pred * (1 - y_true), sample_weight) - ) - self.false_negatives.assign_add( - _weighted_sum((1 - y_pred) * y_true, sample_weight) - ) - self.weights_intermediate.assign_add(_weighted_sum(y_true, sample_weight)) - - def result(self): - precision = tf.math.divide_no_nan( - self.true_positives, 
self.true_positives + self.false_positives - ) - recall = tf.math.divide_no_nan( - self.true_positives, self.true_positives + self.false_negatives - ) - - mul_value = precision * recall - add_value = (tf.math.square(self.beta) * precision) + recall - mean = tf.math.divide_no_nan(mul_value, add_value) - f1_score = mean * (1 + tf.math.square(self.beta)) - - if self.average == "weighted": - weights = tf.math.divide_no_nan( - self.weights_intermediate, tf.reduce_sum(self.weights_intermediate) - ) - f1_score = tf.reduce_sum(f1_score * weights) - - elif self.average is not None: # [micro, macro] - f1_score = tf.reduce_mean(f1_score) - - return f1_score - - def get_config(self): - """Returns the serializable config of the metric.""" - - config = { - "num_classes": self.num_classes, - "average": self.average, - "beta": self.beta, - "threshold": self.threshold, - } - - base_config = super().get_config() - return {**base_config, **config} - - def reset_states(self): - reset_value = tf.zeros(self.init_shape, dtype=self.dtype) - tf.keras.backend.batch_set_value([(v, reset_value) for v in self.variables]) - - -@protected_register_keras_serializable() -class F1Score(FBetaScore): - r"""Computes F-1 Score. - - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # https://github.com/tensorflow/addons/blob/v0.12.0/LICENSE - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================== - - It is the harmonic mean of precision and recall. - Output range is `[0, 1]`. Works for both multi-class - and multi-label classification. - $$ - F_1 = 2 \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{\textrm{precision} + \textrm{recall}} - $$ - Args: - num_classes: Number of unique classes in the dataset. - average: Type of averaging to be performed on data. - Acceptable values are `None`, `micro`, `macro` - and `weighted`. Default value is None. - threshold: Elements of `y_pred` above threshold are - considered to be 1, and the rest 0. If threshold is - None, the argmax is converted to 1, and the rest 0. - name: (Optional) String name of the metric instance. - dtype: (Optional) Data type of the metric result. - Returns: - F-1 Score: float. - """ - - # Modification: remove the run-time type checking for functions - def __init__(self, num_classes, average=None, threshold=None, - name="f1_score", dtype=None): - super().__init__(num_classes, average, 1.0, threshold, name=name, dtype=dtype) - - def get_config(self): - base_config = super().get_config() - del base_config["beta"] - return base_config - - -class PreEncodedCharCnnModel(BaseTrainableModel, - metaclass=AutoSubRegistrationMeta): - - # boolean if the label mapping requires the mapping for index 0 reserved - requires_zero_mapping = True - - def __init__(self, label_mapping=None, parameters=None): - """ - CNN Model Initializer. 
initialize epoch_id - - :param label_mapping: maps labels to their encoded integers - :type label_mapping: dict - :param parameters: Contains all the appropriate parameters for the - model. Must contain num_labels. Other possible parameters are: - max_length, max_char_encoding_id, dim_embed, size_fc - dropout, size_conv, num_fil, optimizer, default_label - :type parameters: dict - :return: None - """ - - # parameter initialization - if not parameters: - parameters = {} - parameters.setdefault('max_length', 1014) - parameters.setdefault('alphabet_size', 69) - parameters.setdefault('dim_embed', 32) - parameters.setdefault('conv_layers', [ - [256, 7, 1], - [256, 7, 1], - [256, 3, -1], - [256, 3, -1], - [256, 3, -1], - [256, 3, 1] - ]) - parameters.setdefault('size_fc', [512, 512]) - parameters.setdefault('dropout', 0.5) - parameters.setdefault('threshold', 1e-6) - parameters.setdefault('default_label', "UNKNOWN") - parameters['pad_label'] = 'PAD' - self._epoch_id = 0 - - # reconstruct flags for model - self._model_num_labels = 0 - self._model_default_ind = -1 - - BaseModel.__init__(self, label_mapping, parameters) - - def __eq__(self, other): - """ - Checks if two models are equal with one another, may only check - important variables, i.e. may not check model itself. - - :param self: a model - :param other: a model - :type self: BaseModel - :type other: BaseModel - :return: Whether or not self and other are equal - :rtype: bool - """ - if self._parameters != other._parameters \ - or self._label_mapping != other._label_mapping: - return False - return True - - def _validate_parameters(self, parameters): - """ - Validate the parameters sent in. Raise error if invalid parameters are - present. - - :param parameters: parameter dict containing the following parameters: - max_length: Maximum char length in a sample - max_char_encoding_id: Maximum integer value for encoding the input - dim_embed: Number of embedded dimensions - size_fc: Size of each fully connected layers - dropout: Ratio of dropout in the model - size_conv: Convolution kernel size - default_label: Key for label_mapping that is the default label - pad_label: Key for entities_dict that is the pad label - num_fil: Number of filters in each convolution layer - :type parameters: dict - :return: None - """ - errors = [] - list_of_necessary_params = ['max_length', 'alphabet_size', - 'dim_embed', 'size_fc', 'dropout', - 'threshold', 'conv_layers', 'default_label', - 'pad_label'] - # Make sure the necessary parameters are present and valid. 
- for param in parameters: - if param in ['max_length', 'alphabet_size', 'dim_embed']: - if not isinstance(parameters[param], (int, float)) \ - or parameters[param] < 0: - errors.append(param + " must be a valid integer or float " - "greater than 0.") - elif param in ['dropout', 'threshold']: - if not isinstance(parameters[param], (int, float)) \ - or parameters[param] < 0 or parameters[param] > 1: - errors.append(param + " must be a valid integer or float " - "from 0 to 1.") - elif param == 'size_fc': - if not isinstance(parameters[param], list) \ - or len(parameters[param]) == 0: - errors.append(param + " must be a non-empty list of " - "integers.") - else: - for item in parameters[param]: - if not isinstance(item, int): - errors.append(param + " must be a non-empty " - "list of integers.") - break - elif param == 'conv_layers': - is_bad_conv_layers = True - if isinstance(parameters[param], list): - is_bad_conv_layers = False - for layer in parameters[param]: - if (not isinstance(layer, list) or len(layer) != 3 - or any([not isinstance(x, int) for x in layer])): - is_bad_conv_layers = True - if is_bad_conv_layers: - errors.append(param + " must be a non-empty list of " - "tuples containing 3 integers.") - elif param == 'default_label': - if not isinstance(parameters[param], str): - error = str(param) + " must be a string." - errors.append(error) - - # Error if there are extra parameters thrown in - for param in parameters: - if param not in list_of_necessary_params: - errors.append(param + " is not an accepted parameter.") - if errors: - raise ValueError('\n'.join(errors)) - - def set_label_mapping(self, label_mapping): - """ - Sets the labels for the model - - :param label_mapping: label mapping of the model - :type label_mapping: dict - :return: None - """ - if not isinstance(label_mapping, (list, dict)): - raise TypeError("Labels must either be a non-empty encoding dict " - "which maps labels to index encodings or a list.") - - label_mapping = copy.deepcopy(label_mapping) - if 'PAD' not in label_mapping: - if isinstance(label_mapping, list): # if list missing PAD - label_mapping = ['PAD'] + label_mapping - elif 0 not in label_mapping.values(): # if dict missing PAD and 0 - label_mapping.update({'PAD': 0}) - if (isinstance(label_mapping, dict) - and label_mapping.get('PAD', None) != 0): # dict with bad PAD - raise ValueError("`PAD` must map to index zero.") - if self._parameters['default_label'] not in label_mapping: - raise ValueError("The `default_label` of {} must exist in the " - "label mapping.".format( - self._parameters['default_label'])) - super().set_label_mapping(label_mapping) - - def _need_to_reconstruct_model(self): - """ - Determines whether or not the model needs to be reconstructed. - - :return: bool of whether or not the model needs to reconstruct. 
- """ - if not self._model: - return False - default_ind = self.label_mapping[self._parameters['default_label']] - return self.num_labels != self._model_num_labels or \ - default_ind != self._model_default_ind - - def save_to_disk(self, dirpath): - """ - Saves whole model to disk with weights - - :param dirpath: directory path where you want to save the model to - :type dirpath: str - :return: None - """ - if not self._model: - self._construct_model() - elif self._need_to_reconstruct_model(): - self._reconstruct_model() - - model_param_dirpath = os.path.join(dirpath, "model_parameters.json") - with open(model_param_dirpath, 'w') as fp: - json.dump(self._parameters, fp) - labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'w') as fp: - json.dump(self.label_mapping, fp) - self._model.save(os.path.join(dirpath)) - - @classmethod - def load_from_disk(cls, dirpath): - """ - Loads whole model from disk with weights - - :param dirpath: directory path where you want to load the model from - :type dirpath: str - :return: None - """ - - # load parameters - model_param_dirpath = os.path.join(dirpath, "model_parameters.json") - with open(model_param_dirpath, 'r') as fp: - parameters = json.load(fp) - - # load label_mapping - labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'r') as fp: - label_mapping = json.load(fp) - - # use f1 score metric - custom_objects = { - "F1Score": F1Score( - num_classes=max(label_mapping.values()) + 1, - average='micro'), - "CharacterLevelCnnModel": cls, - } - with tf.keras.utils.custom_object_scope(custom_objects): - tf_model = tf.keras.models.load_model(dirpath) - - loaded_model = cls(label_mapping, parameters) - loaded_model._model = tf_model - # - # # Tensorflow v1 Model weights need to be transferred. - # if not callable(tf_model): - # loaded_model._construct_model() - # tf1_weights = [] - # for var in tf_model.variables: - # if 'training' not in var.name: - # tf1_weights.append(var.value()) - # - # loaded_model._construct_model() - # tf1_weights.append(loaded_model._model.weights[-1].value()) - # loaded_model._model.set_weights(tf1_weights) - - # load self - loaded_model._model_num_labels = loaded_model.num_labels - loaded_model._model_default_ind = loaded_model.label_mapping[ - loaded_model._parameters['default_label'] - ] - return loaded_model - - def _construct_model(self): - """ - Model constructor for the data labeler. This also serves as a weight - reset. 
- - :return: None - """ - num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] - - # default parameters - max_length = self._parameters['max_length'] - alphabet_size = self._parameters['alphabet_size'] - dim_embed = self._parameters['dim_embed'] - conv_layers = self._parameters['conv_layers'] - size_fc = self._parameters['size_fc'] - threshold = self._parameters['threshold'] - dropout = self._parameters['dropout'] - - # Reset model - tf.keras.backend.clear_session() - - # Input layer - inputs = tf.keras.layers.Input( - shape=(None,), name='sent_input', dtype='int64') - # Embedding layers - x_embedding = tf.keras.layers.Embedding( - alphabet_size + 1, dim_embed, input_length=max_length)(inputs) - - # Convolution layers - x = x_embedding - for cl in conv_layers: - x = tf.keras.layers.Convolution1D(cl[0], cl[1], padding='same')(x) - x = tf.keras.layers.ThresholdedReLU(threshold)(x) - if cl[2] != -1: - x = tf.keras.layers.MaxPooling1D(cl[2])(x) - # x = tf.keras.layers.Flatten()(x) - - # Fully connected layers - for fl in size_fc: - x_dense = tf.keras.layers.Dense(fl)(x) - x = tf.keras.layers.ThresholdedReLU(threshold)(x_dense) - x = tf.keras.layers.Dropout(dropout)(x) - - # Output layer - predictions = tf.keras.layers.Dense( - num_labels, activation='softmax', name='softmax_output')(x) - # argmax layer - argmax_layer = tf.keras.backend.argmax(predictions) - - # Build and compile model - self._model = tf.keras.models.Model( - inputs=inputs, outputs=[predictions, argmax_layer]) - - # Compile the model w/ metrics - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - - # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) - - self._epoch_id = 0 - self._model_num_labels = num_labels - self._model_default_ind = default_ind - - def reset_weights(self): - """ - Reset the weights of the model. 
- - :return: None - """ - self._construct_model() - - def _reconstruct_model(self): - """ - Reconstruct the appropriate layers if the number of number of labels is - altered - - :return: None - """ - - # Reset model - tf.keras.backend.clear_session() - - num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] - - # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') - for _ in range(2): - self._model.layers.pop() - - # Add the final Softmax layer to the previous spot - final_softmax_layer = tf.keras.layers.Dense( - num_labels, activation='softmax', name="softmax_output")( - self._model.layers[-4].output) - - # Output the model into a .pb file for TensorFlow - argmax_layer = tf.keras.backend.argmax(final_softmax_layer) - - - argmax_outputs = [final_softmax_layer, argmax_layer] - self._model = tf.keras.Model(self._model.inputs, argmax_outputs) - - # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - losses = {softmax_output_layer_name: "categorical_crossentropy"} - - # use f1 score metric - f1_score_training = F1Score(num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} - - self._model.compile(loss=losses, optimizer="adam", metrics=metrics) - - self._epoch_id = 0 - self._model_num_labels = num_labels - self._model_default_ind = default_ind - - def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, - reset_weights=False, verbose=True): - """ - Train the current model with the training data and validation data - - :param train_data: Training data used to train model - :type train_data: Union[list, np.ndarray] - :param val_data: Validation data used to validate the training - :type val_data: Union[list, np.ndarray] - :param batch_size: Used to determine number of samples in each batch - :type batch_size: int - :param label_mapping: maps labels to their encoded integers - :type label_mapping: Union[dict, None] - :param reset_weights: Flag to determine whether to reset the weights or - not - :type reset_weights: bool - :param verbose: Flag to determine whether to print status or not - :type verbose: bool - :return: None - """ - - if label_mapping is not None: - self.set_label_mapping(label_mapping) - - if not self._model: - self._construct_model() - else: - if self._need_to_reconstruct_model(): - self._reconstruct_model() - if reset_weights: - self.reset_weights() - - history = defaultdict() - f1 = None - f1_report = [] - - self._model.reset_metrics() - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] - - start_time = time.time() - batch_id = 0 - for x_train, y_train in train_data: - model_results = self._model.train_on_batch( - x_train, {softmax_output_layer_name: y_train}) - sys.stdout.flush() - if verbose: - sys.stdout.write( - "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " - "f1_score %f" % - (self._epoch_id, batch_id, *model_results[1:])) - batch_id += 1 - - for i, metric_label in enumerate(self._model.metrics_names): - history[metric_label] = model_results[i] - - if val_data: - f1, f1_report = self._validate_training(val_data) - history['f1_report'] = f1_report - - val_f1 = f1_report['weighted avg']['f1-score'] \ - if f1_report else np.NAN - val_precision = f1_report['weighted avg']['precision'] \ - if f1_report else np.NAN - val_recall = f1_report['weighted avg']['recall'] \ - if f1_report else np.NAN - epoch_time = time.time() - start_time - logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - 
f1_score %f -- " - "val_f1: %f - val_precision: %f - val_recall %f" % - (self._epoch_id, epoch_time, *model_results[1:], - val_f1, val_precision, val_recall)) - - self._epoch_id += 1 - - return history, f1, f1_report - - def _validate_training(self, val_data, batch_size_test=32, - verbose_log=True, verbose_keras=False): - """ - Validate the model on the test set and return the evaluation metrics. - - :param val_data: data generator for the validation - :type val_data: iterator - :param batch_size_test: Number of samples to process in testing - :type batch_size_test: int - :param verbose_log: whether or not to print out scores for training, - etc. - :type verbose_log: bool - :param verbose_keras: whether or not to print out scores for training, - from keras. - :type verbose_keras: bool - return (f1-score, f1 report). - """ - f1 = None - f1_report = None - - if val_data is None: - return f1, f1_report - - # Predict on the test set - batch_id = 0 - y_val_pred = [] - y_val_test = [] - for x_val, y_val in val_data: - y_val_pred.append(self._model.predict( - x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) - y_val_test.append(np.argmax(y_val, axis=-1)) - batch_id += 1 - sys.stdout.flush() - if verbose_log: - sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % - (self._epoch_id, batch_id)) - - tf.keras.backend.set_floatx('float32') - # Clean the predicted entities and the actual entities - f1, f1_report = labeler_utils.evaluate_accuracy( - np.concatenate(y_val_pred, axis=0), - np.concatenate(y_val_test, axis=0), - self.num_labels, - self.reverse_label_mapping, - verbose=verbose_keras) - - return f1, f1_report - - def predict(self, data, batch_size=32, show_confidences=False, - verbose=True): - """ - Run model and get predictions - - :param data: text input - :type data: Union[list, numpy.ndarray] - :param batch_size: number of samples in the batch of data - :type batch_size: int - :param show_confidences: whether user wants prediction confidences - :type show_confidences: - :param verbose: Flag to determine whether to print status or not - :type verbose: bool - :return: char level predictions and confidences - :rtype: dict - """ - if not self._model: - raise ValueError("You are trying to predict without a model. " - "Construct/Load a model before predicting.") - elif self._need_to_reconstruct_model(): - raise RuntimeError("The model label mapping definitions have been " - "altered without additional training. 
Please " - "train the model or reset the label mapping to " - "predict.") - # Pre-allocate space for predictions - confidences = [] - sentence_lengths = np.zeros((batch_size,), dtype=int) - predictions = np.zeros((batch_size, self._parameters['max_length'])) - if show_confidences: - confidences = np.zeros((batch_size, - self._parameters['max_length'], - self.num_labels)) - - # Run model with batching - allocation_index = 0 - for batch_id, batch_data in enumerate(data): - model_output = self._model( - tf.convert_to_tensor(batch_data) - ) - - # Count number of samples in batch to prevent array mismatch - num_samples_in_batch = len(batch_data) - allocation_index = batch_id * batch_size - - # Double array size - if len(predictions) <= allocation_index: - predictions = np.pad(predictions, ((0, len(predictions)), - (0, 0)), mode='constant') - sentence_lengths = np.pad( - sentence_lengths, pad_width=((0, len(sentence_lengths)),), - mode='constant') - if show_confidences: - confidences = np.pad(confidences, - ((0, len(predictions)), - (0, 0), (0, 0)), mode='constant') - - if show_confidences: - confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() - predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() - sentence_lengths[allocation_index:allocation_index + num_samples_in_batch] = list(map(lambda x: len(x), batch_data)) - - allocation_index += num_samples_in_batch - - # Convert predictions, confidences to lists from numpy - predictions_list = [i for i in range(0, allocation_index)] - confidences_list = None - if show_confidences: - confidences_list = [i for i in range(0, allocation_index)] - - # Append slices of predictions to return prediction & confidence matrices - for index, sentence_length \ - in enumerate(sentence_lengths[:allocation_index]): - predictions_list[index] = list(predictions[index][:sentence_length]) - if show_confidences: - confidences_list[index] = list(confidences[index][:sentence_length]) - - if show_confidences: - return {'pred': predictions_list, 'conf': confidences_list} - return {'pred': predictions_list} - - def details(self): - """ - Prints the relevant details of the model (summary, parameters, label - mapping) - """ - print("\n###### Model Details ######\n") - self._model.summary() - print("\nModel Parameters:") - for key, value in self._parameters.items(): - print("{}: {}".format(key, value)) - print("\nModel Label Mapping:") - for key, value in self.label_mapping.items(): - print("{}: {}".format(key, value)) From 9a17b74a531563c903d6d5f74885d70321fd278e Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:16:24 -0500 Subject: [PATCH 10/14] fix: f1score imports --- dataprofiler/tests/labelers/test_f_scores.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/tests/labelers/test_f_scores.py b/dataprofiler/tests/labelers/test_f_scores.py index 241d3db50..e707dc1ea 100644 --- a/dataprofiler/tests/labelers/test_f_scores.py +++ b/dataprofiler/tests/labelers/test_f_scores.py @@ -22,7 +22,7 @@ import numpy as np import tensorflow as tf -from dataprofiler.labelers.character_level_cnn_model import F1Score, FBetaScore +from dataprofiler.labelers.labeler_utils import F1Score, FBetaScore class TestFScore(unittest.TestCase): From 0b7a44ff0385bea510695d4420ea1e1ea456b367 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Tue, 12 Jul 2022 17:32:32 -0500 Subject: [PATCH 11/14] fix: logger count --- dataprofiler/tests/labelers/test_labeler_utils.py | 4 
++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataprofiler/tests/labelers/test_labeler_utils.py b/dataprofiler/tests/labelers/test_labeler_utils.py index b352ebd9e..58c32a7a3 100644 --- a/dataprofiler/tests/labelers/test_labeler_utils.py +++ b/dataprofiler/tests/labelers/test_labeler_utils.py @@ -294,8 +294,8 @@ def test_get_tf_layer_index_from_name(self): def test_hide_tf_logger_warnings(self): logger = logging.getLogger('tensorflow') - self.assertListEqual([], logger.filters) + num_loggers = len(logger.filters) # make change and validate updated filter labeler_utils.hide_tf_logger_warnings() - self.assertEqual(1, len(logger.filters)) + self.assertEqual(1 + num_loggers, len(logger.filters)) From ade62e40244aa04cd27eb7f37509f8d17b531f10 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 13 Jul 2022 09:36:28 -0500 Subject: [PATCH 12/14] fix: reformat with isort / black --- dataprofiler/labelers/char_load_tf_model.py | 220 +++++++++++--------- 1 file changed, 123 insertions(+), 97 deletions(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 5abc1fa96..01e2f6bb8 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -1,17 +1,16 @@ -import json import copy +import json import os import sys import time from collections import defaultdict -import tensorflow as tf import numpy as np +import tensorflow as tf -from . import labeler_utils -from .base_model import BaseModel, BaseTrainableModel -from .base_model import AutoSubRegistrationMeta from .. import dp_logging +from . import labeler_utils +from .base_model import AutoSubRegistrationMeta, BaseModel, BaseTrainableModel _file_dir = os.path.dirname(os.path.abspath(__file__)) @@ -19,14 +18,12 @@ labeler_utils.hide_tf_logger_warnings() -class CharLoadTFModel(BaseTrainableModel, - metaclass=AutoSubRegistrationMeta): +class CharLoadTFModel(BaseTrainableModel, metaclass=AutoSubRegistrationMeta): # boolean if the label mapping requires the mapping for index 0 reserved requires_zero_mapping = False - def __init__(self, model_path, label_mapping=None, - parameters=None): + def __init__(self, model_path, label_mapping=None, parameters=None): """ Loadable TF Model Initializer. @@ -45,9 +42,9 @@ def __init__(self, model_path, label_mapping=None, # parameter initialization if not parameters: parameters = {} - parameters.setdefault('default_label', "UNKNOWN") - parameters['model_path'] = model_path - parameters['pad_label'] = 'PAD' + parameters.setdefault("default_label", "UNKNOWN") + parameters["model_path"] = model_path + parameters["pad_label"] = "PAD" self._epoch_id = 0 # reconstruct flags for model @@ -68,8 +65,10 @@ def __eq__(self, other): :return: Whether or not self and other are equal :rtype: bool """ - if self._parameters != other._parameters \ - or self._label_mapping != other._label_mapping: + if ( + self._parameters != other._parameters + or self._label_mapping != other._label_mapping + ): return False return True @@ -92,12 +91,11 @@ def _validate_parameters(self, parameters): :return: None """ errors = [] - list_of_necessary_params = ['model_path', 'default_label', - 'pad_label'] + list_of_necessary_params = ["model_path", "default_label", "pad_label"] # Make sure the necessary parameters are present and valid. 
for param in parameters: - if param in ['default_label', 'model_path', 'pad_label']: + if param in ["default_label", "model_path", "pad_label"]: if not isinstance(parameters[param], str): error = str(param) + " must be a string." errors.append(error) @@ -107,7 +105,7 @@ def _validate_parameters(self, parameters): if param not in list_of_necessary_params: errors.append(param + " is not an accepted parameter.") if errors: - raise ValueError('\n'.join(errors)) + raise ValueError("\n".join(errors)) def set_label_mapping(self, label_mapping): """ @@ -118,22 +116,24 @@ def set_label_mapping(self, label_mapping): :return: None """ if not isinstance(label_mapping, (list, dict)): - raise TypeError("Labels must either be a non-empty encoding dict " - "which maps labels to index encodings or a list.") + raise TypeError( + "Labels must either be a non-empty encoding dict " + "which maps labels to index encodings or a list." + ) label_mapping = copy.deepcopy(label_mapping) - if 'PAD' not in label_mapping: + if "PAD" not in label_mapping: if isinstance(label_mapping, list): # if list missing PAD - label_mapping = ['PAD'] + label_mapping + label_mapping = ["PAD"] + label_mapping elif 0 not in label_mapping.values(): # if dict missing PAD and 0 - label_mapping.update({'PAD': 0}) + label_mapping.update({"PAD": 0}) else: - label_mapping.update( - {'PAD': max(list(label_mapping.values())) + 1}) - if self._parameters['default_label'] not in label_mapping: - raise ValueError("The `default_label` of {} must exist in the " - "label mapping.".format( - self._parameters['default_label'])) + label_mapping.update({"PAD": max(list(label_mapping.values())) + 1}) + if self._parameters["default_label"] not in label_mapping: + raise ValueError( + "The `default_label` of {} must exist in the " + "label mapping.".format(self._parameters["default_label"]) + ) super().set_label_mapping(label_mapping) def _need_to_reconstruct_model(self): @@ -144,9 +144,11 @@ def _need_to_reconstruct_model(self): """ if not self._model: return False - default_ind = self.label_mapping[self._parameters['default_label']] - return self.num_labels != self._model_num_labels or \ - default_ind != self._model_default_ind + default_ind = self.label_mapping[self._parameters["default_label"]] + return ( + self.num_labels != self._model_num_labels + or default_ind != self._model_default_ind + ) def save_to_disk(self, dirpath): """ @@ -164,10 +166,10 @@ def save_to_disk(self, dirpath): model_param_dirpath = os.path.join(dirpath, "model_parameters.json") model_parameters = self._parameters.copy() model_parameters.pop("model_path") - with open(model_param_dirpath, 'w') as fp: + with open(model_param_dirpath, "w") as fp: json.dump(model_parameters, fp) labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'w') as fp: + with open(labels_dirpath, "w") as fp: json.dump(self.label_mapping, fp) self._model.save(dirpath) @@ -183,19 +185,19 @@ def load_from_disk(cls, dirpath): # load parameters model_param_dirpath = os.path.join(dirpath, "model_parameters.json") - with open(model_param_dirpath, 'r') as fp: + with open(model_param_dirpath, "r") as fp: parameters = json.load(fp) # load label_mapping labels_dirpath = os.path.join(dirpath, "label_mapping.json") - with open(labels_dirpath, 'r') as fp: + with open(labels_dirpath, "r") as fp: label_mapping = json.load(fp) # use f1 score metric custom_objects = { "F1Score": labeler_utils.F1Score( - num_classes=max(label_mapping.values()) + 1, - average='micro'), + 
num_classes=max(label_mapping.values()) + 1, average="micro" + ), "CharLoadTFModel": cls, } with tf.keras.utils.custom_object_scope(custom_objects): @@ -207,7 +209,7 @@ def load_from_disk(cls, dirpath): # load self loaded_model._model_num_labels = loaded_model.num_labels loaded_model._model_default_ind = loaded_model.label_mapping[ - loaded_model._parameters['default_label'] + loaded_model._parameters["default_label"] ] return loaded_model @@ -219,37 +221,38 @@ def _construct_model(self): :return: None """ num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] - model_loc = self._parameters['model_path'] + default_ind = self.label_mapping[self._parameters["default_label"]] + model_loc = self._parameters["model_path"] self._model = tf.keras.models.load_model(model_loc) - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] softmax_layer_ind = labeler_utils.get_tf_layer_index_from_name( - self._model, softmax_output_layer_name) + self._model, softmax_output_layer_name + ) softmax_layer = self._model.get_layer(softmax_output_layer_name) prev_softmax_layer = softmax_layer.input new_softmax_layer = softmax_layer.output if softmax_layer.weights[0].shape[-1] != num_labels: new_softmax_layer = tf.keras.layers.Dense( - num_labels, activation='softmax', name="softmax_output")( - self._model.layers[softmax_layer_ind - 1].output) + num_labels, activation="softmax", name="softmax_output" + )(self._model.layers[softmax_layer_ind - 1].output) # Output the model into a .pb file for TensorFlow argmax_layer = tf.keras.backend.argmax(new_softmax_layer) - argmax_outputs = [new_softmax_layer, argmax_layer] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model w/ metrics - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score metric f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + num_classes=num_labels, average="micro" + ) + metrics = {softmax_output_layer_name: ["acc", f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -277,7 +280,7 @@ def _reconstruct_model(self): tf.keras.backend.clear_session() num_labels = self.num_labels - default_ind = self.label_mapping[self._parameters['default_label']] + default_ind = self.label_mapping[self._parameters["default_label"]] # Remove the 2 output layers ('softmax', 'tf_op_layer_ArgMax') for _ in range(2): @@ -285,24 +288,24 @@ def _reconstruct_model(self): # Add the final Softmax layer to the previous spot final_softmax_layer = tf.keras.layers.Dense( - num_labels, activation='softmax', name="softmax_output")( - self._model.layers[-4].output) + num_labels, activation="softmax", name="softmax_output" + )(self._model.layers[-4].output) # Output the model into a .pb file for TensorFlow argmax_layer = tf.keras.backend.argmax(final_softmax_layer) - argmax_outputs = [final_softmax_layer, argmax_layer] self._model = tf.keras.Model(self._model.inputs, argmax_outputs) # Compile the model - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] losses = {softmax_output_layer_name: "categorical_crossentropy"} # use f1 score 
metric f1_score_training = labeler_utils.F1Score( - num_classes=num_labels, average='micro') - metrics = {softmax_output_layer_name: ['acc', f1_score_training]} + num_classes=num_labels, average="micro" + ) + metrics = {softmax_output_layer_name: ["acc", f1_score_training]} self._model.compile(loss=losses, optimizer="adam", metrics=metrics) @@ -310,8 +313,15 @@ def _reconstruct_model(self): self._model_num_labels = num_labels self._model_default_ind = default_ind - def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, - reset_weights=False, verbose=True): + def fit( + self, + train_data, + val_data=None, + batch_size=32, + label_mapping=None, + reset_weights=False, + verbose=True, + ): """ Train the current model with the training data and validation data @@ -347,19 +357,20 @@ def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, f1_report = [] self._model.reset_metrics() - softmax_output_layer_name = self._model.outputs[0].name.split('/')[0] + softmax_output_layer_name = self._model.outputs[0].name.split("/")[0] start_time = time.time() batch_id = 0 for x_train, y_train in train_data: model_results = self._model.train_on_batch( - x_train, {softmax_output_layer_name: y_train}) + x_train, {softmax_output_layer_name: y_train} + ) sys.stdout.flush() if verbose: sys.stdout.write( "\rEPOCH %d, batch_id %d: loss: %f - acc: %f - " - "f1_score %f" % - (self._epoch_id, batch_id, *model_results[1:])) + "f1_score %f" % (self._epoch_id, batch_id, *model_results[1:]) + ) batch_id += 1 for i, metric_label in enumerate(self._model.metrics_names): @@ -367,26 +378,34 @@ def fit(self, train_data, val_data=None, batch_size=32, label_mapping=None, if val_data: f1, f1_report = self._validate_training(val_data) - history['f1_report'] = f1_report - - val_f1 = f1_report['weighted avg']['f1-score'] \ - if f1_report else np.NAN - val_precision = f1_report['weighted avg']['precision'] \ - if f1_report else np.NAN - val_recall = f1_report['weighted avg']['recall'] \ - if f1_report else np.NAN + history["f1_report"] = f1_report + + val_f1 = f1_report["weighted avg"]["f1-score"] if f1_report else np.NAN + val_precision = ( + f1_report["weighted avg"]["precision"] if f1_report else np.NAN + ) + val_recall = f1_report["weighted avg"]["recall"] if f1_report else np.NAN epoch_time = time.time() - start_time - logger.info("\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " - "val_f1: %f - val_precision: %f - val_recall %f" % - (self._epoch_id, epoch_time, *model_results[1:], - val_f1, val_precision, val_recall)) + logger.info( + "\rEPOCH %d (%ds), loss: %f - acc: %f - f1_score %f -- " + "val_f1: %f - val_precision: %f - val_recall %f" + % ( + self._epoch_id, + epoch_time, + *model_results[1:], + val_f1, + val_precision, + val_recall, + ) + ) self._epoch_id += 1 return history, f1, f1_report - def _validate_training(self, val_data, batch_size_test=32, - verbose_log=True, verbose_keras=False): + def _validate_training( + self, val_data, batch_size_test=32, verbose_log=True, verbose_keras=False + ): """ Validate the model on the test set and return the evaluation metrics. 
@@ -413,28 +432,32 @@ def _validate_training(self, val_data, batch_size_test=32, y_val_pred = [] y_val_test = [] for x_val, y_val in val_data: - y_val_pred.append(self._model.predict( - x_val, batch_size=batch_size_test, verbose=verbose_keras)[1]) + y_val_pred.append( + self._model.predict( + x_val, batch_size=batch_size_test, verbose=verbose_keras + )[1] + ) y_val_test.append(np.argmax(y_val, axis=-1)) batch_id += 1 sys.stdout.flush() if verbose_log: - sys.stdout.write("\rEPOCH %g, validation_batch_id %d" % - (self._epoch_id, batch_id)) + sys.stdout.write( + "\rEPOCH %g, validation_batch_id %d" % (self._epoch_id, batch_id) + ) - tf.keras.backend.set_floatx('float32') + tf.keras.backend.set_floatx("float32") # Clean the predicted entities and the actual entities f1, f1_report = labeler_utils.evaluate_accuracy( np.concatenate(y_val_pred, axis=0), np.concatenate(y_val_test, axis=0), self.num_labels, self.reverse_label_mapping, - verbose=verbose_keras) + verbose=verbose_keras, + ) return f1, f1_report - def predict(self, data, batch_size=32, show_confidences=False, - verbose=True): + def predict(self, data, batch_size=32, show_confidences=False, verbose=True): """ Run model and get predictions @@ -452,10 +475,12 @@ def predict(self, data, batch_size=32, show_confidences=False, if not self._model: self._construct_model() elif self._need_to_reconstruct_model(): - raise RuntimeError("The model label mapping definitions have been " - "altered without additional training. Please " - "train the model or reset the label mapping to " - "predict.") + raise RuntimeError( + "The model label mapping definitions have been " + "altered without additional training. Please " + "train the model or reset the label mapping to " + "predict." + ) # Pre-allocate space for predictions confidences = [] predictions = [] @@ -463,9 +488,7 @@ def predict(self, data, batch_size=32, show_confidences=False, # Run model with batching allocation_index = 0 for batch_id, batch_data in enumerate(data): - model_output = self._model( - tf.convert_to_tensor(batch_data) - ) + model_output = self._model(tf.convert_to_tensor(batch_data)) # Count number of samples in batch to prevent array mismatch num_samples_in_batch = len(batch_data) @@ -477,8 +500,12 @@ def predict(self, data, batch_size=32, show_confidences=False, confidences += confidences if show_confidences: - confidences[allocation_index:allocation_index + num_samples_in_batch] = model_output[0].numpy() - predictions[allocation_index:allocation_index + num_samples_in_batch] = model_output[1].numpy() + confidences[ + allocation_index : allocation_index + num_samples_in_batch + ] = model_output[0].numpy() + predictions[ + allocation_index : allocation_index + num_samples_in_batch + ] = model_output[1].numpy() allocation_index += num_samples_in_batch @@ -486,12 +513,11 @@ def predict(self, data, batch_size=32, show_confidences=False, predictions = [predictions[i].tolist() for i in range(allocation_index)] confidences_list = None if show_confidences: - confidences = [confidences[i].tolist() - for i in range(0, allocation_index)] + confidences = [confidences[i].tolist() for i in range(0, allocation_index)] if show_confidences: - return {'pred': predictions, 'conf': confidences} - return {'pred': predictions} + return {"pred": predictions, "conf": confidences} + return {"pred": predictions} def details(self): """ From 032ad8e192f8b704cc1835af4fee1c04acedf4e9 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 13 Jul 2022 09:37:33 -0500 Subject: [PATCH 13/14] fix: reformat 
tests for isort / black --- .../test_char_load_tf_data_labeler.py | 121 +++++++++--------- .../tests/labelers/test_char_tf_load_model.py | 35 +++-- 2 files changed, 73 insertions(+), 83 deletions(-) diff --git a/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py index 2f8237db5..a5bcaf6da 100644 --- a/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py +++ b/dataprofiler/tests/labelers/test_char_load_tf_data_labeler.py @@ -4,56 +4,45 @@ from io import StringIO from unittest import mock -from dataprofiler.labelers import DataLabeler, UnstructuredDataLabeler, \ - data_processing -from dataprofiler.labelers.char_load_tf_model import \ - CharLoadTFModel - +from dataprofiler.labelers import DataLabeler, UnstructuredDataLabeler, data_processing +from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) data_labeler_parameters = { - 'model': { - 'class': 'CharLoadTFModel', - 'parameters': {} - }, - 'label_mapping': { - 'PAD': 0, - 'CITY': 1, # SAME AS UNKNOWN - 'UNKNOWN': 1, - 'ADDRESS': 2, - 'PERSON': 3, - }, - 'preprocessor': { - 'class': 'CharEncodedPreprocessor' - }, - 'postprocessor': { - 'class': 'CharPostprocessor' + "model": {"class": "CharLoadTFModel", "parameters": {}}, + "label_mapping": { + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + "PERSON": 3, }, + "preprocessor": {"class": "CharEncodedPreprocessor"}, + "postprocessor": {"class": "CharPostprocessor"}, } preprocessor_parameters = { - 'encoding_map': {'t': 1, 's': 2}, - 'flatten_split': 0, - 'flatten_separator': ' ', - 'is_separate_at_max_len': True, - + "encoding_map": {"t": 1, "s": 2}, + "flatten_split": 0, + "flatten_separator": " ", + "is_separate_at_max_len": True, } postprocessor_parameters = { - 'use_word_level_argmax': True, - 'output_format': 'character_argmax', - 'separators': (' ', ',', ';', "'", '"', ':', '\n', '\t', "."), - 'word_level_min_percent': 0.75, + "use_word_level_argmax": True, + "output_format": "character_argmax", + "separators": (" ", ",", ";", "'", '"', ":", "\n", "\t", "."), + "word_level_min_percent": 0.75, } def mock_open(filename, *args): - if filename.find('data_labeler_parameters') >= 0: + if filename.find("data_labeler_parameters") >= 0: return StringIO(json.dumps(data_labeler_parameters)) - elif filename.find('preprocessor_parameters') >= 0: + elif filename.find("preprocessor_parameters") >= 0: return StringIO(json.dumps(preprocessor_parameters)) - elif filename.find('postprocessor_parameters') >= 0: + elif filename.find("postprocessor_parameters") >= 0: return StringIO(json.dumps(postprocessor_parameters)) @@ -64,68 +53,71 @@ def setup_save_mock_open(mock_open): return mock_file -@mock.patch('dataprofiler.labelers.data_processing.BaseDataProcessor') -@mock.patch('dataprofiler.labelers.char_load_tf_model.' - 'CharLoadTFModel.load_from_disk') +@mock.patch("dataprofiler.labelers.data_processing.BaseDataProcessor") +@mock.patch( + "dataprofiler.labelers.char_load_tf_model." 
"CharLoadTFModel.load_from_disk" +) @mock.patch("builtins.open", side_effect=mock_open) class TestCharTFLoadDataLabeler(unittest.TestCase): - @staticmethod def _setup_mock_load_model(mock_load_model): model_mock = mock.Mock(spec=CharLoadTFModel) model_mock.set_num_labels = mock.Mock() mock_load_model.return_value = model_mock model_mock.requires_zero_mapping = True - model_mock.labels = ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON'] + model_mock.labels = ["PAD", "UNKNOWN", "ADDRESS", "PERSON"] model_mock.label_mapping = { - 'PAD': 0, - 'CITY': 1, # SAME AS UNKNOWN - 'UNKNOWN': 1, - 'ADDRESS': 2, - 'PERSON': 3, + "PAD": 0, + "CITY": 1, # SAME AS UNKNOWN + "UNKNOWN": 1, + "ADDRESS": 2, + "PERSON": 3, } model_mock.reverse_label_mapping = { - 0: 'PAD', - 1: 'UNKNOWN', - 2: 'ADDRESS', - 3: 'PERSON', + 0: "PAD", + 1: "UNKNOWN", + 2: "ADDRESS", + 3: "PERSON", } @staticmethod def _setup_mock_load_processor(mock_base_processor): def side_effect(arg): processor = { - 'CharEncodedPreprocessor': mock.Mock( - spec=data_processing.CharEncodedPreprocessor), - 'CharPostprocessor': mock.Mock( - spec=data_processing.CharPostprocessor), + "CharEncodedPreprocessor": mock.Mock( + spec=data_processing.CharEncodedPreprocessor + ), + "CharPostprocessor": mock.Mock(spec=data_processing.CharPostprocessor), }[arg] processor.load_from_disk.return_value = processor return processor mock_base_processor.get_class.side_effect = side_effect - def test_load_from_disk(self, mock_open, mock_load_model, - mock_base_processor): + def test_load_from_disk(self, mock_open, mock_load_model, mock_base_processor): self._setup_mock_load_model(mock_load_model) self._setup_mock_load_processor(mock_base_processor) # load default - data_labeler = DataLabeler.load_from_disk('fake/path') + data_labeler = DataLabeler.load_from_disk("fake/path") - self.assertDictEqual(data_labeler.label_mapping, - data_labeler_parameters['label_mapping']) + self.assertDictEqual( + data_labeler.label_mapping, data_labeler_parameters["label_mapping"] + ) self.assertListEqual( - data_labeler.labels, - ['PAD', 'UNKNOWN', 'ADDRESS', 'PERSON']) + data_labeler.labels, ["PAD", "UNKNOWN", "ADDRESS", "PERSON"] + ) self.assertIsInstance( - data_labeler.preprocessor, data_processing.BaseDataPreprocessor) + data_labeler.preprocessor, data_processing.BaseDataPreprocessor + ) self.assertIsInstance( - data_labeler.postprocessor, data_processing.BaseDataPostprocessor) + data_labeler.postprocessor, data_processing.BaseDataPostprocessor + ) - def test_save_to_disk(self, mock_open, mock_load_model, - mock_load_processor, *mocks): + def test_save_to_disk( + self, mock_open, mock_load_model, mock_load_processor, *mocks + ): self._setup_mock_load_model(mock_load_model) self._setup_mock_load_processor(mock_load_processor) @@ -137,12 +129,13 @@ def test_save_to_disk(self, mock_open, mock_load_model, mock_file = setup_save_mock_open(mock_open) # save and test - data_labeler.save_to_disk('test/path') + data_labeler.save_to_disk("test/path") self.assertEqual( '{"model": {"class": "CharLoadTFModel"}, ' '"preprocessor": {"class": "CharEncodedPreprocessor"}, ' '"postprocessor": {"class": "CharPostprocessor"}}', - mock_file.getvalue()) + mock_file.getvalue(), + ) # close mock StringIO.close(mock_file) diff --git a/dataprofiler/tests/labelers/test_char_tf_load_model.py b/dataprofiler/tests/labelers/test_char_tf_load_model.py index 8f06af6f2..fbfde0c49 100644 --- a/dataprofiler/tests/labelers/test_char_tf_load_model.py +++ b/dataprofiler/tests/labelers/test_char_tf_load_model.py @@ -32,11 +32,14 @@ 
def mock_tf_model(*args, **kwargs): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Input(shape=(None,), dtype=tf.int64)) - model.add(tf.keras.layers.Embedding( - input_dim=100, - output_dim=30, - embeddings_initializer="normal", - trainable=True)) + model.add( + tf.keras.layers.Embedding( + input_dim=100, + output_dim=30, + embeddings_initializer="normal", + trainable=True, + ) + ) model.add(tf.keras.layers.Dense(units=10, activation="relu")) model.add(tf.keras.layers.Dense(10, activation="softmax")) return model @@ -56,7 +59,7 @@ def setup_save_mock_open(mock_open): return mock_file -@mock.patch('tensorflow.keras.models.load_model', side_effect=mock_tf_model) +@mock.patch("tensorflow.keras.models.load_model", side_effect=mock_tf_model) class TestCharLoadTFModel(unittest.TestCase): @classmethod def setUpClass(cls): @@ -121,7 +124,7 @@ def test_init(self, *mocks): ] self.assertDictEqual(self.label_mapping, model.label_mapping) - self.assertEqual(self.model_path, model._parameters['model_path']) + self.assertEqual(self.model_path, model._parameters["model_path"]) self.assertListEqual(expected_labels, model.labels) def test_reverse_label_mapping(self, *mocks): @@ -151,8 +154,7 @@ def test_reverse_label_mapping(self, *mocks): } self.assertDictEqual( - expected_reverse_label_mapping, - model.reverse_label_mapping + expected_reverse_label_mapping, model.reverse_label_mapping ) def test_set_label_mapping(self, *mocks): @@ -219,15 +221,12 @@ def test_predict(self, *mocks): data_gen = [np.array([[1, 3], [1, 2]])] result = model.predict(data_gen) self.assertIn("pred", result) - self.assertEqual((2, 2), np.array(result['pred']).shape) + self.assertEqual((2, 2), np.array(result["pred"]).shape) result = model.predict(data_gen, show_confidences=True) self.assertIn("pred", result) self.assertIn("conf", result) - self.assertEqual( - (2, 2, model.num_labels), - np.array(result['conf']).shape - ) + self.assertEqual((2, 2, model.num_labels), np.array(result["conf"]).shape) def test_fit_and_predict(self, *mocks): # model @@ -260,7 +259,7 @@ def test_fit_and_predict(self, *mocks): "TEST": 1, "NEW": 2, "MAPPING": 3, - model._parameters['default_label']: 4, + model._parameters["default_label"]: 4, } data_gen = [ [ @@ -304,9 +303,7 @@ def test_param_validation(self, *mocks): "fake_extra_param": "fails", } model = CharLoadTFModel( - self.model_path, - label_mapping=self.label_mapping, - parameters=parameters + self.model_path, label_mapping=self.label_mapping, parameters=parameters ) model._construct_model() self.assertDictEqual(parameters, model._parameters) @@ -314,7 +311,7 @@ def test_param_validation(self, *mocks): CharLoadTFModel( self.model_path, label_mapping=self.label_mapping, - parameters=invalid_parameters + parameters=invalid_parameters, ) @mock.patch("sys.stdout", new_callable=StringIO) From 4002125a1335bbd281af484b9d8089d8f5d29556 Mon Sep 17 00:00:00 2001 From: Jeremy Goodsitt Date: Wed, 13 Jul 2022 10:14:38 -0500 Subject: [PATCH 14/14] fix: variable suggestion --- dataprofiler/labelers/char_load_tf_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/labelers/char_load_tf_model.py b/dataprofiler/labelers/char_load_tf_model.py index 01e2f6bb8..5357dfb73 100644 --- a/dataprofiler/labelers/char_load_tf_model.py +++ b/dataprofiler/labelers/char_load_tf_model.py @@ -95,7 +95,7 @@ def _validate_parameters(self, parameters): # Make sure the necessary parameters are present and valid. 
for param in parameters: - if param in ["default_label", "model_path", "pad_label"]: + if param in list_of_necessary_params: if not isinstance(parameters[param], str): error = str(param) + " must be a string." errors.append(error)
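A note on the F-beta definition quoted in the docstrings above: `result()` computes `mul_value = precision * recall` and `add_value = (tf.math.square(self.beta) * precision) + recall`, so the metric the code actually implements is

$$
F_{\beta} = (1 + \beta^2) \cdot \frac{\textrm{precision} \cdot \textrm{recall}}{(\beta^2 \cdot \textrm{precision}) + \textrm{recall}}
$$

The docstring rendering, which squares the precision in the numerator, is a typo; the numerator of the weighted harmonic mean is precision times recall, as the `result()` implementation shows.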
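For reference, a minimal usage sketch of the `CharLoadTFModel` exercised by the tests above. The label mapping mirrors `test_char_load_tf_data_labeler.py`, and the toy integer batch matches `test_predict`; the SavedModel directory is a hypothetical stand-in, and the model on disk is assumed to end in a softmax layer, which `_construct_model` re-heads with a `Dense` softmax of width `num_labels` plus an argmax output.

    import numpy as np

    from dataprofiler.labelers.char_load_tf_model import CharLoadTFModel

    # Hypothetical path to any Keras SavedModel ending in a softmax layer.
    model_path = "path/to/saved_model"
    label_mapping = {"PAD": 0, "UNKNOWN": 1, "ADDRESS": 2, "PERSON": 3}

    model = CharLoadTFModel(model_path, label_mapping=label_mapping)

    # predict() consumes an iterable of pre-encoded integer batches and
    # returns character-level predictions, plus confidences on request.
    data_gen = [np.array([[1, 3], [1, 2]])]
    results = model.predict(data_gen, show_confidences=True)
    # results["pred"] has shape (2, 2); results["conf"] adds a
    # num_labels axis, matching the assertions in test_predict.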
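The `PAD` handling in `CharLoadTFModel.set_label_mapping` depends on the input type; a short sketch of the three branches, continuing the hypothetical `model` above:

    # A list is prepended with 'PAD', so it lands at index 0.
    model.set_label_mapping(["UNKNOWN", "ADDRESS", "PERSON"])

    # A dict lacking 'PAD' but already using index 0 gets 'PAD' at max + 1.
    model.set_label_mapping({"UNKNOWN": 0, "ADDRESS": 1, "PERSON": 2})
    # -> {'UNKNOWN': 0, 'ADDRESS': 1, 'PERSON': 2, 'PAD': 3}

    # A dict missing both 'PAD' and index 0 gets 'PAD': 0 added.
    model.set_label_mapping({"UNKNOWN": 1, "ADDRESS": 2, "PERSON": 3})
    # -> {'PAD': 0, 'UNKNOWN': 1, 'ADDRESS': 2, 'PERSON': 3}

In every case the configured `default_label` ("UNKNOWN" by default) must be present in the mapping, or a `ValueError` is raised.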