diff --git a/tods/detection_algorithm/core/AutoRegOD.py b/tods/detection_algorithm/core/AutoRegOD.py new file mode 100644 index 00000000..a2286dca --- /dev/null +++ b/tods/detection_algorithm/core/AutoRegOD.py @@ -0,0 +1,198 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for univariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from sklearn.linear_model import LinearRegression + +from .CollectiveBase import CollectiveBaseDetector + +from .utility import get_sub_matrices + + +class AutoRegOD(CollectiveBaseDetector): + """Autoregressive models use linear regression to calculate a sample's + deviance from the predicted value, which is then used as its + outlier scores. This model is for univariate time series. + See MultiAutoRegOD for multivariate data. + + See :cite:`aggarwal2015outlier` Chapter 9 for details. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, contamination=0.1): + super(AutoRegOD, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + + # generate X and y + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + window_size=self.window_size, + step=self.step_size, + return_numpy=True, + flatten=True) + # remove the last one + sub_matrices = sub_matrices[:-1, :] + self.left_inds_ = self.left_inds_[:-1] + self.right_inds_ = self.right_inds_[:-1] + + self.valid_len_ = sub_matrices.shape[0] + + y_buf = np.zeros([self.valid_len_, 1]) + + for i in range(self.valid_len_): + y_buf[i] = X[i * self.step_size + self.window_size] + # print(sub_matrices.shape, y_buf.shape) + + # fit the linear regression model + self.lr_ = LinearRegression(fit_intercept=True) + self.lr_.fit(sub_matrices, y_buf) + self.decision_scores_ = np.absolute( + y_buf.ravel() - self.lr_.predict(sub_matrices).ravel()) + + self._process_decision_scores() + return self + + def predict(self, X): # pragma: no cover + """Predict if a particular sample is an outlier or not. 
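+
+        A minimal usage sketch (illustrative only; the data below are
+        arbitrary and the labels depend on the fitted model):
+
+        >>> import numpy as np
+        >>> X = np.arange(20, dtype=float).reshape(-1, 1)
+        >>> clf = AutoRegOD(window_size=3, contamination=0.2)
+        >>> _ = clf.fit(X)
+        >>> labels, left_inds, right_inds = clf.predict(X)
+        >>> labels.shape == left_inds.shape == right_inds.shape
+        True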
+ + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + Returns + ------- + outlier_labels : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. 0 stands for inliers and 1 for outliers. + """ + + check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) + + pred_score, X_left_inds, X_right_inds = self.decision_function(X) + + pred_score = np.concatenate((np.zeros((self.window_size,)), pred_score)) + X_left_inds = np.concatenate((np.zeros((self.window_size,)), X_left_inds)) + X_right_inds = np.concatenate((np.zeros((self.window_size,)), X_right_inds)) + + return (pred_score > self.threshold_).astype( + 'int').ravel(), X_left_inds.ravel(), X_right_inds.ravel() + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + check_is_fitted(self, ['lr_']) + + sub_matrices, X_left_inds, X_right_inds = \ + get_sub_matrices(X, + window_size=self.window_size, + step=self.step_size, + return_numpy=True, + flatten=True) + + # remove the last one + sub_matrices = sub_matrices[:-1, :] + X_left_inds = X_left_inds[:-1] + X_right_inds = X_right_inds[:-1] + + valid_len = sub_matrices.shape[0] + + y_buf = np.zeros([valid_len, 1]) + + for i in range(valid_len): + y_buf[i] = X[i * self.step_size + self.window_size] + + pred_score = np.absolute( + y_buf.ravel() - self.lr_.predict(sub_matrices).ravel()) + + return pred_score, X_left_inds.ravel(), X_right_inds.ravel() + + +if __name__ == "__main__": # pragma: no cover + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, + 100]).reshape(-1, 1) + + X_test = np.asarray( + [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, + 1) + + clf = AutoRegOD(window_size=3, contamination=0.2) + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/CollectiveBase.py b/tods/detection_algorithm/core/CollectiveBase.py new file mode 100644 index 00000000..67207bfe --- /dev/null +++ b/tods/detection_algorithm/core/CollectiveBase.py @@ -0,0 +1,476 @@ +# -*- coding: utf-8 -*- +"""Base class for all Collective outlier detector models +""" + +from __future__ import division +from __future__ import print_function + +import warnings +from collections import defaultdict + +from inspect import signature + +import abc +from abc import ABCMeta + +import numpy as np +from numpy import percentile +from scipy.special import erf +from sklearn.preprocessing import MinMaxScaler +from sklearn.utils import deprecated +from sklearn.utils.validation import check_is_fitted +from 
sklearn.utils.multiclass import check_classification_targets + + +def _pprint(params, offset=0, printer=repr): # pragma: no cover + # noinspection PyPep8 + """Pretty print the dictionary 'params' + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + + :param params: The dictionary to pretty print + :type params: dict + + :param offset: The offset in characters to add at the begin of each line. + :type offset: int + + :param printer: The function to convert entries to strings, typically + the builtin str or repr + :type printer: callable + + :return: None + """ + + # Do a multi-line justified repr: + options = np.get_printoptions() + np.set_printoptions(precision=5, threshold=64, edgeitems=2) + params_list = list() + this_line_length = offset + line_sep = ',\n' + (1 + offset // 2) * ' ' + for i, (k, v) in enumerate(sorted(params.items())): + if type(v) is float: + # use str for representing floating point numbers + # this way we get consistent representation across + # architectures and versions. + this_repr = '%s=%s' % (k, str(v)) + else: + # use repr of the rest + this_repr = '%s=%s' % (k, printer(v)) + if len(this_repr) > 500: + this_repr = this_repr[:300] + '...' + this_repr[-100:] + if i > 0: + if this_line_length + len(this_repr) >= 75 or '\n' in this_repr: + params_list.append(line_sep) + this_line_length = len(line_sep) + else: + params_list.append(', ') + this_line_length += 2 + params_list.append(this_repr) + this_line_length += len(this_repr) + + np.set_printoptions(**options) + lines = ''.join(params_list) + # Strip trailing space to avoid nightmare in doctests + lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n')) + return lines + + +class CollectiveBaseDetector(metaclass=ABCMeta): + """Abstract class for all outlier detection algorithms. + + Parameters + ---------- + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + window_size : int, optional (default=1) + The moving window size. + + step_size :, optional (default=1) + The displacement for moving window. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + @abc.abstractmethod + def __init__(self, contamination=0.1, + window_size=1, + step_size=1): # pragma: no cover + + if not (0. < contamination <= 0.5): + raise ValueError("contamination must be in (0, 0.5], " + "got: %f" % contamination) + + self.contamination = contamination + self.window_size = window_size + self.step_size = step_size + self._classes = 2 # leave the parameter on for extension + self.left_inds_ = None + self.right_inds = None + + # noinspection PyIncorrectDocstring + @abc.abstractmethod + def fit(self, X, y=None): # pragma: no cover + """Fit detector. 
y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + pass + + @abc.abstractmethod + def decision_function(self, X): # pragma: no cover + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + pass + + @deprecated() + def fit_predict(self, X, y=None): # pragma: no cover + """Fit detector first and then predict whether a particular sample + is an outlier or not. y is ignored in unsupervised models. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + outlier_labels : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. 0 stands for inliers and 1 for outliers. + + .. deprecated:: 0.6.9 + `fit_predict` will be removed in pyod 0.8.0.; it will be + replaced by calling `fit` function first and then accessing + `labels_` attribute for consistency. + """ + + self.fit(X, y) + return self.labels_ + + def predict(self, X): # pragma: no cover + """Predict if a particular sample is an outlier or not. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + Returns + ------- + outlier_labels : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. 0 stands for inliers and 1 for outliers. + """ + + check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) + + pred_score, X_left_inds, X_right_inds = self.decision_function(X) + + return (pred_score > self.threshold_).astype( + 'int').ravel(), X_left_inds.ravel(), X_right_inds.ravel() + + def predict_proba(self, X, method='linear'): # pragma: no cover + """Predict the probability of a sample being outlier. Two approaches + are possible: + + 1. simply use Min-max conversion to linearly transform the outlier + scores into the range of [0,1]. The model must be + fitted first. + 2. use unifying scores, see :cite:`kriegel2011interpreting`. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + method : str, optional (default='linear') + probability conversion method. It must be one of + 'linear' or 'unify'. + + Returns + ------- + outlier_probability : numpy array of shape (n_samples,) + For each observation, tells whether or not + it should be considered as an outlier according to the + fitted model. Return the outlier probability, ranging + in [0,1]. 
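+
+        Examples
+        --------
+        A minimal sketch with one concrete subclass from this package
+        (illustrative only; the probabilities depend on the fitted model):
+
+        >>> import numpy as np
+        >>> from tods.detection_algorithm.core.AutoRegOD import AutoRegOD
+        >>> X = np.arange(30, dtype=float).reshape(-1, 1)
+        >>> clf = AutoRegOD(window_size=3).fit(X)
+        >>> probs, left_inds, right_inds = clf.predict_proba(X, method='linear')
+        >>> probs.shape[1]  # column 0: inlier probability, column 1: outlier
+        2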
+ """ + + check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) + train_scores = self.decision_scores_ + + test_scores, X_left_inds, X_right_inds = self.decision_function(X) + + probs = np.zeros([test_scores.shape[0], int(self._classes)]) + if method == 'linear': + scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1)) + probs[:, 1] = scaler.transform( + test_scores.reshape(-1, 1)).ravel().clip(0, 1) + probs[:, 0] = 1 - probs[:, 1] + return probs, X_left_inds.ravel(), X_right_inds.ravel() + + elif method == 'unify': + # turn output into probability + pre_erf_score = (test_scores - self._mu) / ( + self._sigma * np.sqrt(2)) + erf_score = erf(pre_erf_score) + probs[:, 1] = erf_score.clip(0, 1).ravel() + probs[:, 0] = 1 - probs[:, 1] + return probs, X_left_inds.ravel(), X_right_inds.ravel() + else: + raise ValueError(method, + 'is not a valid probability conversion method') + + def _predict_rank(self, X, normalized=False): # pragma: no cover + """Predict the outlyingness rank of a sample by a fitted model. The + method is for outlier detector score combination. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + normalized : bool, optional (default=False) + If set to True, all ranks are normalized to [0,1]. + + Returns + ------- + ranks : array, shape (n_samples,) + Outlying rank of a sample according to the training data. + + """ + + check_is_fitted(self, ['decision_scores_']) + + test_scores = self.decision_function(X) + train_scores = self.decision_scores_ + + sorted_train_scores = np.sort(train_scores) + ranks = np.searchsorted(sorted_train_scores, test_scores) + + if normalized: + # return normalized ranks + ranks = ranks / ranks.max() + return ranks + + def _set_n_classes(self, y): # pragma: no cover + """Set the number of classes if `y` is presented, which is not + expected. It could be useful for multi-class outlier detection. + + Parameters + ---------- + y : numpy array of shape (n_samples,) + Ground truth. + + Returns + ------- + self + """ + + self._classes = 2 # default as binary classification + if y is not None: + check_classification_targets(y) + self._classes = len(np.unique(y)) + warnings.warn( + "y should not be presented in unsupervised learning.") + return self + + def _process_decision_scores(self): # pragma: no cover + """Internal function to calculate key attributes: + + - threshold_: used to decide the binary label + - labels_: binary labels of training data + + Returns + ------- + self + """ + + self.threshold_ = percentile(self.decision_scores_, + 100 * (1 - self.contamination)) + self.labels_ = (self.decision_scores_ > self.threshold_).astype( + 'int').ravel() + + # calculate for predict_proba() + + self._mu = np.mean(self.decision_scores_) + self._sigma = np.std(self.decision_scores_) + + return self + + # noinspection PyMethodParameters + def _get_param_names(cls): # pragma: no cover + # noinspection PyPep8 + """Get parameter names for the estimator + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. 
+ """ + + # fetch the constructor or the original constructor before + # deprecation wrapping if any + init = getattr(cls.__init__, 'deprecated_original', cls.__init__) + if init is object.__init__: + # No explicit constructor to introspect + return [] + + # introspect the constructor arguments to find the model parameters + # to represent + init_signature = signature(init) + # Consider the constructor parameters excluding 'self' + parameters = [p for p in init_signature.parameters.values() + if p.name != 'self' and p.kind != p.VAR_KEYWORD] + for p in parameters: + if p.kind == p.VAR_POSITIONAL: + raise RuntimeError("scikit-learn estimators should always " + "specify their parameters in the signature" + " of their __init__ (no varargs)." + " %s with constructor %s doesn't " + " follow this convention." + % (cls, init_signature)) + # Extract and sort argument names excluding 'self' + return sorted([p.name for p in parameters]) + + # noinspection PyPep8 + def get_params(self, deep=True): # pragma: no cover + """Get parameters for this estimator. + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + + Parameters + ---------- + deep : bool, optional (default=True) + If True, will return the parameters for this estimator and + contained subobjects that are estimators. + + Returns + ------- + params : mapping of string to any + Parameter names mapped to their values. + """ + + out = dict() + for key in self._get_param_names(): + # We need deprecation warnings to always be on in order to + # catch deprecated param values. + # This is set in utils/__init__.py but it gets overwritten + # when running under python3 somehow. + warnings.simplefilter("always", DeprecationWarning) + try: + with warnings.catch_warnings(record=True) as w: + value = getattr(self, key, None) + if len(w) and w[0].category == DeprecationWarning: + # if the parameter is deprecated, don't show it + continue + finally: + warnings.filters.pop(0) + + # XXX: should we rather test if instance of estimator? + if deep and hasattr(value, 'get_params'): + deep_items = value.get_params().items() + out.update((key + '__' + k, val) for k, val in deep_items) + out[key] = value + return out + + def set_params(self, **params): # pragma: no cover + # noinspection PyPep8 + """Set the parameters of this estimator. + The method works on simple estimators as well as on nested objects + (such as pipelines). The latter have parameters of the form + ``__`` so that it's possible to update each + component of a nested object. + + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + + Returns + ------- + self : object + """ + + if not params: + # Simple optimization to gain speed (inspect is slow) + return self + valid_params = self.get_params(deep=True) + + nested_params = defaultdict(dict) # grouped by prefix + for key, value in params.items(): + key, delim, sub_key = key.partition('__') + if key not in valid_params: + raise ValueError('Invalid parameter %s for estimator %s. ' + 'Check the list of available parameters ' + 'with `estimator.get_params().keys()`.' 
% + (key, self)) + + if delim: + nested_params[key][sub_key] = value + else: + setattr(self, key, value) + + for key, sub_params in nested_params.items(): + valid_params[key].set_params(**sub_params) + + return self + + def __repr__(self): # pragma: no cover + # noinspection PyPep8 + """ + See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html + and sklearn/base.py for more information. + """ + + class_name = self.__class__.__name__ + return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), + offset=len(class_name), ),) diff --git a/tods/detection_algorithm/core/CollectiveCommonTest.py b/tods/detection_algorithm/core/CollectiveCommonTest.py new file mode 100755 index 00000000..1eb6b986 --- /dev/null +++ b/tods/detection_algorithm/core/CollectiveCommonTest.py @@ -0,0 +1,174 @@ +# -*- coding: utf-8 -*- + +from __future__ import division +from __future__ import print_function + +import os +import sys + +import numpy as np +import unittest +# noinspection PyProtectedMember + +from numpy.testing import assert_equal +from numpy.testing import assert_allclose +from numpy.testing import assert_array_less +from numpy.testing import assert_raises + +from unittest import TestCase + +from sklearn.utils.estimator_checks import check_estimator + +from sklearn.metrics import roc_auc_score +from scipy.stats import rankdata + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from pyod.utils.data import generate_data + +_dummy = TestCase('__init__') +assert_greater = _dummy.assertGreater +assert_greater_equal = _dummy.assertGreaterEqual +assert_less = _dummy.assertLess +assert_less_equal = _dummy.assertLessEqual + + +class CollectiveCommonTest: + def __init__(self, + model, + X_train, + y_train, + X_test, + y_test, + roc_floor, + ): + self.clf = model + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + self.roc_floor = roc_floor + + self.clf.fit(self.X_train) + + pass + + def test_detector(self): + + self.test_parameters() + self.test_train_scores() + self.test_train_inds() + self.test_prediction_scores() + self.test_prediction_proba() + self.test_prediction_proba_linear() + self.test_prediction_proba_unify() + self.test_prediction_proba_parameter() + # self.test_fit_predict() + # self.test_fit_predict_score() + self.test_prediction_labels() + self.test_prediction_inds() + # self.test_predict_rank() + # self.test_predict_rank_normalized() + self.tearDown() + + def test_parameters(self): + assert (hasattr(self.clf, 'decision_scores_') and + self.clf.decision_scores_ is not None) + assert (hasattr(self.clf, 'labels_') and + self.clf.labels_ is not None) + assert (hasattr(self.clf, 'threshold_') and + self.clf.threshold_ is not None) + assert (hasattr(self.clf, 'left_inds_') and + self.clf.left_inds_ is not None) + assert (hasattr(self.clf, 'right_inds_') and + self.clf.right_inds_ is not None) + assert (hasattr(self.clf, '_mu') and + self.clf._mu is not None) + assert (hasattr(self.clf, '_sigma') and + self.clf._sigma is not None) + + def test_train_scores(self): + assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) + + def test_train_inds(self): + inds_valid = self.clf.left_inds_ < self.clf.right_inds_ + assert_equal(self.clf.left_inds_.shape, self.clf.decision_scores_.shape) + assert_equal(self.clf.right_inds_.shape, 
self.clf.decision_scores_.shape) + assert_equal(all(inds_valid), True) + + def test_prediction_scores(self): + pred_scores, _, _ = self.clf.decision_function(self.X_test) + # check score shapes + assert_equal(pred_scores.shape[0], self.y_test.shape[0]) + + # check performance + assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor) + + def test_prediction_labels(self): + pred_labels, _, _ = self.clf.predict(self.X_test) + assert_equal(pred_labels.shape, self.y_test.shape) + + def test_prediction_inds(self): + _, left_inds, right_inds = self.clf.predict(self.X_test) + inds_valid = left_inds < right_inds + + assert_equal(left_inds.shape, self.y_test.shape) + assert_equal(right_inds.shape, self.y_test.shape) + assert_equal(all(inds_valid), True) + + + def test_prediction_proba(self): + pred_proba, _, _ = self.clf.predict_proba(self.X_test) + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_linear(self): + pred_proba, _, _ = self.clf.predict_proba(self.X_test, method='linear') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_unify(self): + pred_proba, _, _ = self.clf.predict_proba(self.X_test, method='unify') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_parameter(self): + with assert_raises(ValueError): + self.clf.predict_proba(self.X_test, method='something') + + def test_fit_predict(self): # pragma: no cover + pred_labels, _, _ = self.clf.fit_predict(X=self.X_train) + assert_equal(pred_labels.shape, self.y_train.shape) + + def test_fit_predict_score(self): # pragma: no cover + self.clf.fit_predict_score(self.X_test, self.y_test) + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='roc_auc_score') + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='prc_n_score') + with assert_raises(NotImplementedError): + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='something') + + def test_predict_rank(self): # pragma: no cover + pred_socres, _, _ = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, self.X_train.shape[0] + 1) + assert_array_less(-0.1, pred_ranks) + + def test_predict_rank_normalized(self): # pragma: no cover + pred_socres, _, _ = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, 1.01) + assert_array_less(-0.1, pred_ranks) + + def tearDown(self): + pass diff --git a/tods/detection_algorithm/core/KDiscord.py b/tods/detection_algorithm/core/KDiscord.py new file mode 100644 index 00000000..ff74a217 --- /dev/null +++ b/tods/detection_algorithm/core/KDiscord.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for multivariate time series outlier detection. 
+""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted + +from .CollectiveBase import CollectiveBaseDetector +from pyod.models.knn import KNN + +from .utility import get_sub_matrices + + +# TODO: add an argument to exclude "near equal" samples +# TODO: another thought is to treat each dimension independent +class KDiscord(CollectiveBaseDetector): + """KDiscord first split multivariate time series into + subsequences (matrices), and it use kNN outlier detection based on PyOD. + For an observation, its distance to its kth nearest neighbor could be + viewed as the outlying score. It could be viewed as a way to measure + the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for + details. + + See :cite:`aggarwal2015outlier,zhao2020using` for details. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. + + n_neighbors : int, optional (default = 5) + Number of neighbors to use by default for k neighbors queries. + + method : str, optional (default='largest') + {'largest', 'mean', 'median'} + + - 'largest': use the distance to the kth neighbor as the outlier score + - 'mean': use the average of all k neighbors as the outlier score + - 'median': use the median of the distance to k neighbors as the + outlier score + + radius : float, optional (default = 1.0) + Range of parameter space to use by default for `radius_neighbors` + queries. + + algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional + Algorithm used to compute the nearest neighbors: + + - 'ball_tree' will use BallTree + - 'kd_tree' will use KDTree + - 'brute' will use a brute-force search. + - 'auto' will attempt to decide the most appropriate algorithm + based on the values passed to :meth:`fit` method. + + Note: fitting on sparse input will override the setting of + this parameter, using brute force. + + .. deprecated:: 0.74 + ``algorithm`` is deprecated in PyOD 0.7.4 and will not be + possible in 0.7.6. It has to use BallTree for consistency. + + leaf_size : int, optional (default = 30) + Leaf size passed to BallTree. This can affect the + speed of the construction and query, as well as the memory + required to store the tree. The optimal value depends on the + nature of the problem. + + metric : string or callable, default 'minkowski' + metric to use for distance computation. Any metric from scikit-learn + or scipy.spatial.distance can be used. + + If metric is a callable function, it is called on each + pair of instances (rows) and the resulting value recorded. The callable + should take two arrays as input and return one value indicating the + distance between them. This works for Scipy's metrics, but is less + efficient than passing the metric name as a string. + + Distance matrices are not supported. 
+ + Valid values for metric are: + + - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', + 'manhattan'] + + - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', + 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', + 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', + 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', + 'sqeuclidean', 'yule'] + + See the documentation for scipy.spatial.distance for details on these + metrics. + + p : integer, optional (default = 2) + Parameter for the Minkowski metric from + sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is + equivalent to using manhattan_distance (l1), and euclidean_distance + (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. + See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances + + metric_params : dict, optional (default = None) + Additional keyword arguments for the metric function. + + n_jobs : int, optional (default = 1) + The number of parallel jobs to run for neighbors search. + If ``-1``, then the number of jobs is set to the number of CPU cores. + Affects only kneighbors and kneighbors_graph methods. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, contamination=0.1, + n_neighbors=5, method='largest', + radius=1.0, algorithm='auto', leaf_size=30, + metric='minkowski', p=2, metric_params=None, n_jobs=1, + **kwargs): + super(KDiscord, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + + # parameters for kNN + self.n_neighbors = n_neighbors + self.method = method + self.radius = radius + self.algorithm = algorithm + self.leaf_size = leaf_size + self.metric = metric + self.p = p + self.metric_params = metric_params + self.n_jobs = n_jobs + + # initialize a kNN model + self.model_ = KNN(contamination=self.contamination, + n_neighbors=self.n_neighbors, + radius=self.radius, + algorithm=self.algorithm, + leaf_size=self.leaf_size, + metric=self.metric, + p=self.p, + metric_params=self.metric_params, + n_jobs=self.n_jobs, + **kwargs) + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. 
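+
+        Examples
+        --------
+        A minimal sketch (illustrative only; it mirrors the demo at the
+        bottom of this module):
+
+        >>> import numpy as np
+        >>> X_train = np.asarray(
+        ...     [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78,
+        ...      100]).reshape(-1, 1)
+        >>> clf = KDiscord(window_size=3, step_size=1, contamination=0.2,
+        ...                n_neighbors=5)
+        >>> _ = clf.fit(X_train)
+        >>> clf.decision_scores_ is not None
+        True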
+ """ + X = check_array(X).astype(np.float) + + # first convert it into submatrices, and flatten it + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True) + + # fit the kNN model + self.model_.fit(sub_matrices) + self.decision_scores_ = self.model_.decision_scores_ + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + check_is_fitted(self, ['model_']) + X = check_array(X).astype(np.float) + # first convert it into submatrices, and flatten it + sub_matrices, X_left_inds, X_right_inds = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True) + + # return the prediction result by kNN + return self.model_.decision_function(sub_matrices), \ + X_left_inds.ravel(), X_right_inds.ravel() + + +if __name__ == "__main__": # pragma: no cover + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, + 100]).reshape(-1, 1) + + X_test = np.asarray( + [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, + 1) + + # X_train = np.asarray( + # [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], + # [12., 12], + # [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + # + # X_test = np.asarray( + # [[12., 10], [8., 12], [80., 80], [92., 983], + # [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], + # [22., 15]]) + + clf = KDiscord(window_size=3, step_size=1, contamination=0.2, + n_neighbors=5) + + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/LSTMOD.py b/tods/detection_algorithm/core/LSTMOD.py new file mode 100644 index 00000000..ee0cbaad --- /dev/null +++ b/tods/detection_algorithm/core/LSTMOD.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for univariate time series outlier detection. 
+""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from scipy.special import erf +from sklearn.preprocessing import MinMaxScaler + +from .CollectiveBase import CollectiveBaseDetector + +# from tod.utility import get_sub_matrices + +from tensorflow.keras.layers import Dense, LSTM +from tensorflow.keras.models import Sequential + +class LSTMOutlierDetector(CollectiveBaseDetector): + + def __init__(self,contamination=0.1, + train_contamination=0.0, + min_attack_time=5, + danger_coefficient_weight=0.5, + loss='mean_squared_error', + optimizer='adam', + epochs=10, + batch_size=8, + dropout_rate=0.0, + feature_dim=9, + hidden_dim=1, + n_hidden_layer=0, + activation=None, + diff_group_method='average' + ): + + super(LSTMOutlierDetector, self).__init__(contamination=contamination, + # window_size=min_attack_time, + step_size=1, + ) + + self.train_contamination = train_contamination + self.min_attack_time = min_attack_time + self.danger_coefficient_weight = danger_coefficient_weight + self.relative_error_threshold = None + + self.loss = loss + self.optimizer = optimizer + self.epochs = epochs + self.batch_size = batch_size + self.dropout_rate = dropout_rate + self.feature_dim = feature_dim + self.hidden_dim = hidden_dim + self.n_hidden_layer = n_hidden_layer + self.diff_group_method = diff_group_method + self.activation = activation + + + # def _build_model(self): + # print('dim:', self.hidden_dim, self.feature_dim) + # model_ = Sequential() + # model_.add(LSTM(units=self.hidden_dim, input_shape=(self.feature_dim, 1), + # dropout=self.dropout_rate, activation=self.activation, return_sequences=True)) + + # for layer_idx in range(self.n_hidden_layer-1): + # model_.add(LSTM(units=self.hidden_dim, input_shape=(self.hidden_dim, 1), + # dropout=self.dropout_rate, activation=self.activation, return_sequences=True)) + + # model_.add(LSTM(units=self.hidden_dim, input_shape=(self.hidden_dim, 1), + # dropout=self.dropout_rate, activation=self.activation)) + + # model_.add(Dense(units=self.feature_dim, input_shape=(self.hidden_dim, 1), activation=None)) + + # model_.compile(loss=self.loss, optimizer=self.optimizer) + # return model_ + + def _build_model(self): + model_ = Sequential() + model_.add(LSTM(units=self.hidden_dim, input_shape=(self.feature_dim, 1), + dropout=self.dropout_rate, activation=self.activation, + return_sequences=bool(self.n_hidden_layer>0))) + + for layer_idx in range(self.n_hidden_layer): + model_.add(LSTM(units=self.hidden_dim, input_shape=(self.hidden_dim, 1), + dropout=self.dropout_rate, activation=self.activation, + return_sequences=bool(layer_idx < self.n_hidden_layer - 1))) + + model_.add(Dense(units=self.feature_dim, input_shape=(self.hidden_dim, 1), activation=None)) + + model_.compile(loss=self.loss, optimizer=self.optimizer) + return model_ + + def fit(self, X: np.array, y=None) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. 
+ """ + print("XXXX:", X.shape) + X = check_array(X).astype(np.float) + self._set_n_classes(None) + X_buf, y_buf = self._get_sub_matrices(X) + self.feature_dim = X_buf.shape[1] + self.model_ = self._build_model() + + # fit the LSTM model + self.model_.fit(X_buf, y_buf, epochs=self.epochs, batch_size=self.batch_size) + + relative_error = self._relative_error(X) + + if self.train_contamination < 1e-6: + self.relative_error_threshold = max(relative_error) + else: + self.relative_error_threshold = np.percentile(relative_error, 100 * (1 - self.train_contamination)) + + self.decision_scores_, self.left_inds_, self.right_inds_ = self.decision_function(X) + self._process_decision_scores() + + return self + + def _get_sub_matrices(self, X: np.array): + # return X[:-1].reshape(-1, 1, self.feature_dim), X[1:] + return np.expand_dims(X[:-1], axis=2), X[1:] + + + def _relative_error(self, X: np.array): + + X = check_array(X).astype(np.float) + X_buf, y_buf = self._get_sub_matrices(X) + + y_predict = self.model_.predict(X_buf) + + relative_error = (np.linalg.norm(y_predict - y_buf, axis=1) / np.linalg.norm(y_buf + 1e-6, axis=1)).ravel() + + return relative_error + + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. + """ + check_is_fitted(self, ['model_']) + + relative_error = self._relative_error(X) + + error_num_buf = (relative_error > self.relative_error_threshold).astype(int) + + if not (self.diff_group_method in ['max', 'min', 'average']): + raise ValueError(self.diff_group_method, "is not a valid method") + + relative_error_left_inds = np.ones((len(relative_error), )) * len(relative_error) + relative_error_right_inds = np.zeros((len(relative_error), )) + + + if self.diff_group_method == 'average': + danger_coefficient = np.zeros(relative_error.shape) + averaged_relative_error = np.zeros(relative_error.shape) + calculated_times = np.zeros(relative_error.shape) + + for i in range(len(relative_error) - self.min_attack_time + 1): + dc_tmp = error_num_buf[i:i+self.min_attack_time].sum() / self.min_attack_time + are_tmp = relative_error[i:i+self.min_attack_time].sum() / self.min_attack_time + + for j in range(self.min_attack_time): + averaged_relative_error[i + j] += are_tmp + danger_coefficient[i + j] += dc_tmp + calculated_times[i + j] += 1 + relative_error_left_inds[i + j] = i if i < relative_error_left_inds[i + j] else relative_error_left_inds[i + j] + relative_error_right_inds[i + j] = i+self.min_attack_time if i+self.min_attack_time > relative_error_right_inds[i + j] else relative_error_left_inds[i + j] + + # print(calculated_times) + danger_coefficient /= calculated_times + averaged_relative_error /= calculated_times + # print(danger_coefficient, averaged_relative_error) + + + else: # pragma: no cover + danger_coefficient = np.zeros(relative_error.shape) + averaged_relative_error = np.zeros(relative_error.shape) + + if self.diff_group_method == 'min': + danger_coefficient += float('inf') + averaged_relative_error += float('inf') + + for i in range(len(relative_error) - self.min_attack_time + 1): 
+ dc_tmp = error_num_buf[i:i+self.min_attack_time].sum() / self.min_attack_time + are_tmp = relative_error[i:i+self.min_attack_time].sum() / self.min_attack_time + + if self.diff_group_method == 'max': + for j in range(self.min_attack_time): + if are_tmp > averaged_relative_error[i + j] or dc_tmp > danger_coefficient[i+j]: + relative_error_left_inds[i + j] = i + relative_error_right_inds[i + j] = i+self.min_attack_time + if are_tmp > averaged_relative_error[i + j]: + averaged_relative_error[i + j] = are_tmp + if dc_tmp > danger_coefficient[i+j]: + danger_coefficient[i + j] = dc_tmp + + else: + for j in range(self.min_attack_time): + if are_tmp < averaged_relative_error[i + j] or dc_tmp < danger_coefficient[i+j]: + relative_error_left_inds[i + j] = i + relative_error_right_inds[i + j] = i+self.min_attack_time + if are_tmp < averaged_relative_error[i + j]: + averaged_relative_error[i + j] = are_tmp + if dc_tmp < danger_coefficient[i+j]: + danger_coefficient[i + j] = dc_tmp + + + # print(relative_error_left_inds) + # print(relative_error_right_inds) + pred_score = danger_coefficient * self.danger_coefficient_weight + averaged_relative_error * (1 - self.danger_coefficient_weight) + + pred_score = np.concatenate((np.zeros((self.window_size,)), pred_score)) + relative_error_left_inds = np.concatenate((np.arange(self.window_size), relative_error_left_inds+self.window_size)) + relative_error_right_inds = np.concatenate((np.arange(self.window_size)+self.window_size, relative_error_right_inds+self.window_size)) + + return pred_score, relative_error_left_inds, relative_error_right_inds + + +def main(): + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) + + X_test = np.asarray( + [3., 4., 8., 16.1, 18.2, 36.2, 57.1, -10.3, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1,1) + + print(X_train.shape, X_test.shape) + + clf = LSTMOutlierDetector(contamination=0.1) + clf.fit(X_train) + # pred_scores = clf.decision_function(X_test) + pred_labels, left_inds, right_inds = clf.predict(X_test) + + print(pred_labels.shape, left_inds.shape, right_inds.shape) + + print(clf.threshold_) + # print(np.percentile(pred_scores, 100 * 0.9)) + + # print('pred_scores: ',pred_scores) + print('pred_labels: ',pred_labels) + +if __name__ == "__main__": # pragma: no cover + main() diff --git a/tods/detection_algorithm/core/MultiAutoRegOD.py b/tods/detection_algorithm/core/MultiAutoRegOD.py new file mode 100644 index 00000000..01834bc9 --- /dev/null +++ b/tods/detection_algorithm/core/MultiAutoRegOD.py @@ -0,0 +1,238 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for multivariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted +from sklearn.utils import column_or_1d + +from .CollectiveBase import CollectiveBaseDetector +from combo.models.score_comb import average, maximization, median, aom, moa +from combo.utils.utility import standardizer + +from .AutoRegOD import AutoRegOD +from .utility import get_sub_sequences_length + + +class MultiAutoRegOD(CollectiveBaseDetector): + """Autoregressive models use linear regression to calculate a sample's + deviance from the predicted value, which is then used as its + outlier scores. This model is for multivariate time series. + This model handles multivariate time series by various combination + approaches. See AutoRegOD for univarite data. + + See :cite:`aggarwal2015outlier,zhao2020using` for details. 
+ + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, i.e. + the proportion of outliers in the data set. When fitting this is used + to define the threshold on the decision function. + + method : str, optional (default='average') + Combination method: {'average', 'maximization', + 'median'}. Pass in weights of detector for weighted version. + + weights : numpy array of shape (1, n_dimensions) + Score weight by dimensions. + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. + The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, method='average', + weights=None, contamination=0.1): + super(MultiAutoRegOD, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + self.method = method + self.weights = weights + + def _validate_weights(self): + """Internal function for validating and adjust weights. + + Returns + ------- + + """ + if self.weights is None: + self.weights = np.ones([1, self.n_models_]) + else: + self.weights = column_or_1d(self.weights).reshape( + 1, len(self.weights)) + assert (self.weights.shape[1] == self.n_models_) + + # adjust probability by a factor for integrity + adjust_factor = self.weights.shape[1] / np.sum(self.weights) + self.weights = self.weights * adjust_factor + + def _fit_univariate_model(self, X): + """Internal function for fitting one dimensional ts. + """ + X = check_array(X) + n_samples, n_sequences = X.shape[0], X.shape[1] + + models = [] + + # train one model for each dimension + for i in range(n_sequences): + models.append(AutoRegOD(window_size=self.window_size, + step_size=self.step_size, + contamination=self.contamination)) + models[i].fit(X[:, i].reshape(-1, 1)) + + return models + + def _score_combination(self, scores): # pragma: no cover + """Internal function for combining univarite scores. + """ + + # combine by different approaches + if self.method == 'average': + return average(scores, estimator_weights=self.weights) + if self.method == 'maximization': + return maximization(scores) + if self.method == 'median': + return median(scores) + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. 
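+
+        Examples
+        --------
+        A minimal sketch on a two-dimensional series (illustrative only; it
+        mirrors the demo at the bottom of this module):
+
+        >>> import numpy as np
+        >>> X_train = np.asarray(
+        ...     [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12],
+        ...      [12., 12], [18., 16], [20., 7], [18., 10], [23., 12],
+        ...      [22., 15]])
+        >>> clf = MultiAutoRegOD(window_size=3, step_size=1, contamination=0.2)
+        >>> _ = clf.fit(X_train)
+        >>> len(clf.decision_scores_) == len(X_train)
+        True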
+ """ + X = check_array(X).astype(np.float) + + # fit each dimension individually + self.models_ = self._fit_univariate_model(X) + self.valid_len_ = self.models_[0].valid_len_ + self.n_models_ = len(self.models_) + + # assign the left and right inds, same for all models + self.left_inds_ = self.models_[0].left_inds_ + self.right_inds_ = self.models_[0].right_inds_ + + # validate and adjust weights + self._validate_weights() + + # combine the scores from all dimensions + self._decison_mat = np.zeros([self.valid_len_, self.n_models_]) + for i in range(self.n_models_): + self._decison_mat[:, i] = self.models_[i].decision_scores_ + + # scale scores by standardization before score combination + self._decison_mat_scalaled, self._score_scalar = standardizer( + self._decison_mat, keep_scalar=True) + + self.decision_scores_ = self._score_combination( + self._decison_mat_scalaled) + + # print(self.decision_scores_.shape, self.left_inds_.shape, self.right_inds_.shape) + self.decision_scores_ = np.concatenate((np.zeros((self.window_size,)), self.decision_scores_)) + self.left_inds_ = np.concatenate(((-self.window_size) * np.ones((self.window_size,)).astype(np.int), self.left_inds_)) + self.right_inds_ = np.concatenate((np.zeros((self.window_size,)).astype(np.int), self.right_inds_)) + # print(self.decision_scores_.shape, self.left_inds_.shape, self.right_inds_.shape) + + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
+ """ + check_is_fitted(self, ['models_']) + X = check_array(X).astype(np.float) + assert (X.shape[1] == self.n_models_) + n_samples = len(X) + + # need to subtract 1 because need to have y for subtraction + valid_len = get_sub_sequences_length(n_samples, self.window_size, + self.step_size) - 1 + + # combine the scores from all dimensions + decison_mat = np.zeros([valid_len, self.n_models_]) + for i in range(self.n_models_): + decison_mat[:, i], X_left_inds, X_right_inds = \ + self.models_[i].decision_function(X[:, i].reshape(-1, 1)) + + # scale the decision mat + decison_mat_scaled = self._score_scalar.transform(decison_mat) + decision_scores = self._score_combination(decison_mat_scaled) + + # print(decision_scores.shape, X_left_inds.shape, X_right_inds.shape) + decision_scores = np.concatenate((np.zeros((self.window_size,)), decision_scores)) + X_left_inds = np.concatenate(((-self.window_size)*np.ones((self.window_size,)).astype(np.int), X_left_inds)) + X_right_inds = np.concatenate((np.zeros((self.window_size,)).astype(np.int), X_right_inds)) + # print(decision_scores.shape, X_left_inds.shape, X_right_inds.shape) + + return decision_scores, X_left_inds, X_right_inds + + +if __name__ == "__main__": # pragma: no cover + X_train = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + X_test = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + # X_test = np.asarray( + # [[12., 10], [8., 12], [80., 80], [92., 983], + # [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], + # [22., 15]]) + + clf = MultiAutoRegOD(window_size=3, step_size=1, contamination=0.2) + + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/PCA.py b/tods/detection_algorithm/core/PCA.py new file mode 100644 index 00000000..502beb32 --- /dev/null +++ b/tods/detection_algorithm/core/PCA.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- +"""Autoregressive model for multivariate time series outlier detection. +""" +import numpy as np +from sklearn.utils import check_array +from sklearn.utils.validation import check_is_fitted + +from .CollectiveBase import CollectiveBaseDetector +from pyod.models.pca import PCA as PCA_PYOD + +from .utility import get_sub_matrices + + +class PCA(CollectiveBaseDetector): + """PCA-based outlier detection with both univariate and multivariate + time series data. TS data will be first transformed to tabular format. + For univariate data, it will be in shape of [valid_length, window_size]. + for multivariate data with d sequences, it will be in the shape of + [valid_length, window_size]. + + Parameters + ---------- + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + contamination : float in (0., 0.5), optional (default=0.1) + The amount of contamination of the data set, + i.e. the proportion of outliers in the data set. Used when fitting to + define the threshold on the decision function. 
+ + n_components : int, float, None or string + Number of components to keep. It should be smaller than the window_size. + if n_components is not set all components are kept:: + + n_components == min(n_samples, n_features) + + if n_components == 'mle' and svd_solver == 'full', Minka\'s MLE is used + to guess the dimension + if ``0 < n_components < 1`` and svd_solver == 'full', select the number + of components such that the amount of variance that needs to be + explained is greater than the percentage specified by n_components + n_components cannot be equal to n_features for svd_solver == 'arpack'. + + n_selected_components : int, optional (default=None) + Number of selected principal components + for calculating the outlier scores. It is not necessarily equal to + the total number of the principal components. If not set, use + all principal components. + + copy : bool (default True) + If False, data passed to fit are overwritten and running + fit(X).transform(X) will not yield the expected results, + use fit_transform(X) instead. + + whiten : bool, optional (default False) + When True (False by default) the `components_` vectors are multiplied + by the square root of n_samples and then divided by the singular values + to ensure uncorrelated outputs with unit component-wise variances. + + Whitening will remove some information from the transformed signal + (the relative variance scales of the components) but can sometime + improve the predictive accuracy of the downstream estimators by + making their data respect some hard-wired assumptions. + + svd_solver : string {'auto', 'full', 'arpack', 'randomized'} + auto : + the solver is selected by a default policy based on `X.shape` and + `n_components`: if the input data is larger than 500x500 and the + number of components to extract is lower than 80% of the smallest + dimension of the data, then the more efficient 'randomized' + method is enabled. Otherwise the exact full SVD is computed and + optionally truncated afterwards. + full : + run exact full SVD calling the standard LAPACK solver via + `scipy.linalg.svd` and select the components by postprocessing + arpack : + run SVD truncated to n_components calling ARPACK solver via + `scipy.sparse.linalg.svds`. It requires strictly + 0 < n_components < X.shape[1] + randomized : + run randomized SVD by the method of Halko et al. + + tol : float >= 0, optional (default .0) + Tolerance for singular values computed by svd_solver == 'arpack'. + + iterated_power : int >= 0, or 'auto', (default 'auto') + Number of iterations for the power method computed by + svd_solver == 'randomized'. + + random_state : int, RandomState instance or None, optional (default None) + If int, random_state is the seed used by the random number generator; + If RandomState instance, random_state is the random number generator; + If None, the random number generator is the RandomState instance used + by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. + + weighted : bool, optional (default=True) + If True, the eigenvalues are used in score computation. + The eigenvectors with small eigenvalues comes with more importance + in outlier score calculation. + + standardization : bool, optional (default=True) + If True, perform standardization first to convert + data to zero mean and unit variance. + See http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html + + Attributes + ---------- + decision_scores_ : numpy array of shape (n_samples,) + The outlier scores of the training data. 
+ The higher, the more abnormal. Outliers tend to have higher + scores. This value is available once the detector is + fitted. + + threshold_ : float + The threshold is based on ``contamination``. It is the + ``n_samples * contamination`` most abnormal samples in + ``decision_scores_``. The threshold is calculated for generating + binary outlier labels. + + labels_ : int, either 0 or 1 + The binary labels of the training data. 0 stands for inliers + and 1 for outliers/anomalies. It is generated by applying + ``threshold_`` on ``decision_scores_``. + """ + + def __init__(self, window_size, step_size=1, contamination=0.1, + n_components=None, n_selected_components=None, + copy=True, whiten=False, svd_solver='auto', + tol=0.0, iterated_power='auto', random_state=None, + weighted=True, standardization=True): + super(PCA, self).__init__(contamination=contamination) + self.window_size = window_size + self.step_size = step_size + + # parameters for PCA + self.n_components = n_components + self.n_selected_components = n_selected_components + self.copy = copy + self.whiten = whiten + self.svd_solver = svd_solver + self.tol = tol + self.iterated_power = iterated_power + self.random_state = random_state + self.weighted = weighted + self.standardization = standardization + + # initialize a kNN model + self.model_ = PCA_PYOD(n_components=self.n_components, + n_selected_components=self.n_selected_components, + contamination=self.contamination, + copy=self.copy, + whiten=self.whiten, + svd_solver=self.svd_solver, + tol=self.tol, + iterated_power=self.iterated_power, + random_state=self.random_state, + weighted=self.weighted, + standardization=self.standardization) + + def fit(self, X: np.array) -> object: + """Fit detector. y is ignored in unsupervised methods. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. + + y : Ignored + Not used, present for API consistency by convention. + + Returns + ------- + self : object + Fitted estimator. + """ + X = check_array(X).astype(np.float) + + # first convert it into submatrices, and flatten it + sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True, + flatten_order='F') + + # if self.n_components > sub_matrices.shape[1]: + # raise ValueError('n_components exceeds window_size times the number of sequences.') + + # fit the PCA model + self.model_.fit(sub_matrices) + self.decision_scores_ = self.model_.decision_scores_ + self._process_decision_scores() + return self + + def decision_function(self, X: np.array): + """Predict raw anomaly scores of X using the fitted detector. + + The anomaly score of an input sample is computed based on the fitted + detector. For consistency, outliers are assigned with + higher anomaly scores. + + Parameters + ---------- + X : numpy array of shape (n_samples, n_features) + The input samples. Sparse matrices are accepted only + if they are supported by the base estimator. + + Returns + ------- + anomaly_scores : numpy array of shape (n_samples,) + The anomaly score of the input samples. 
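+
+ Example (illustrative sketch only; it reuses the toy multivariate
+ series from the demo at the bottom of this file)::
+
+ X_train = np.asarray(
+ [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12],
+ [12., 12], [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]])
+ clf = PCA(window_size=3, step_size=2, contamination=0.2).fit(X_train)
+ scores, left_inds, right_inds = clf.decision_function(X_train)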
+ """ + check_is_fitted(self, ['model_']) + X = check_array(X).astype(np.float) + # first convert it into submatrices, and flatten it + sub_matrices, X_left_inds, X_right_inds = get_sub_matrices( + X, + self.window_size, + self.step_size, + return_numpy=True, + flatten=True, + flatten_order='F') + + # return the prediction result by PCA + return self.model_.decision_function( + sub_matrices), X_left_inds.ravel(), X_right_inds.ravel() + + +if __name__ == "__main__": # pragma: no cover + # X_train = np.asarray( + # [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) + + # X_test = np.asarray( + # [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, + # 1) + + X_train = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], + [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + w = get_sub_matrices(X_train, window_size=3, step=2, flatten=False) + X_test = np.asarray( + [[12., 10], [8., 12], [80., 80], [92., 983], + [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], + [22., 15]]) + + clf = PCA(window_size=3, step_size=2, contamination=0.2) + + clf.fit(X_train) + decision_scores, left_inds_, right_inds = clf.decision_scores_, \ + clf.left_inds_, clf.right_inds_ + print(clf.left_inds_, clf.right_inds_) + pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) + pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) + pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) + + print(pred_scores) + print(pred_labels) + print(pred_probs) diff --git a/tods/detection_algorithm/core/SODCommonTest.py b/tods/detection_algorithm/core/SODCommonTest.py new file mode 100755 index 00000000..2658e971 --- /dev/null +++ b/tods/detection_algorithm/core/SODCommonTest.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- + +from __future__ import division +from __future__ import print_function + +import os +import sys + +import numpy as np +import unittest +# noinspection PyProtectedMember +from numpy.testing import assert_equal +from numpy.testing import assert_allclose +from numpy.testing import assert_array_less +from numpy.testing import assert_raises + +from unittest import TestCase + +from sklearn.utils.estimator_checks import check_estimator + +from sklearn.metrics import roc_auc_score +from scipy.stats import rankdata + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from pyod.utils.data import generate_data + +_dummy = TestCase('__init__') +assert_greater = _dummy.assertGreater +assert_greater_equal = _dummy.assertGreaterEqual +assert_less = _dummy.assertLess +assert_less_equal = _dummy.assertLessEqual + + +class SODCommonTest: + def __init__(self, + model, + X_train, + y_train, + X_test, + y_test, + roc_floor, + ): + self.clf = model + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + self.roc_floor = roc_floor + + self.clf.fit(X=self.X_train, y=self.y_train) + + pass + + def test_detector(self): + + self.test_parameters() + self.test_train_scores() + self.test_prediction_scores() + self.test_prediction_proba() + #self.test_prediction_proba_linear() + #self.test_prediction_proba_unify() + #self.test_prediction_proba_parameter() + # self.test_fit_predict() + # self.test_fit_predict_score() + self.test_prediction_labels() + # self.test_predict_rank() + # 
self.test_predict_rank_normalized() + self.tearDown() + + def test_parameters(self): + assert (hasattr(self.clf, 'decision_scores_') and + self.clf.decision_scores_ is not None) + assert (hasattr(self.clf, 'labels_') and + self.clf.labels_ is not None) + #assert (hasattr(self.clf, 'threshold_') and + # self.clf.threshold_ is not None) + #assert (hasattr(self.clf, '_mu') and + # self.clf._mu is not None) + #assert (hasattr(self.clf, '_sigma') and + # self.clf._sigma is not None) + + def test_train_scores(self): + assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) + + def test_prediction_scores(self): + pred_scores = self.clf.decision_function(self.X_test) + + # check score shapes + assert_equal(pred_scores.shape[0], self.y_test.shape[0]) + + # check performance + assert_greater_equal(roc_auc_score(self.y_test, pred_scores), self.roc_floor) + + def test_prediction_labels(self): + pred_labels = self.clf.predict(self.X_test) + self.y_test = np.squeeze(self.y_test) + assert_equal(pred_labels.shape, self.y_test.shape) + + def test_prediction_proba(self): + pred_proba = self.clf.predict_proba(self.X_test) + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_linear(self): + pred_proba = self.clf.predict_proba(self.X_test, method='linear') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_unify(self): + pred_proba = self.clf.predict_proba(self.X_test, method='unify') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_parameter(self): + with assert_raises(ValueError): + self.clf.predict_proba(self.X_test, method='something') + + def test_fit_predict(self): # pragma: no cover + pred_labels = self.clf.fit_predict(X=self.X_train, y=self.y_train) + assert_equal(pred_labels.shape, self.y_train.shape) + + def test_fit_predict_score(self): # pragma: no cover + self.clf.fit_predict_score(self.X_test, self.y_test) + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='roc_auc_score') + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='prc_n_score') + with assert_raises(NotImplementedError): + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='something') + + def test_predict_rank(self): # pragma: no cover + pred_socres = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, self.X_train.shape[0] + 1) + assert_array_less(-0.1, pred_ranks) + + def test_predict_rank_normalized(self): # pragma: no cover + pred_socres = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, 1.01) + assert_array_less(-0.1, pred_ranks) + + def tearDown(self): + pass diff --git a/tods/detection_algorithm/core/UODCommonTest.py b/tods/detection_algorithm/core/UODCommonTest.py new file mode 100755 index 00000000..4e9f7c48 --- /dev/null +++ b/tods/detection_algorithm/core/UODCommonTest.py @@ -0,0 +1,153 @@ +# -*- coding: utf-8 -*- + +from __future__ import division +from __future__ import print_function + +import os +import sys + +import numpy as np +import unittest +# noinspection PyProtectedMember +from numpy.testing import 
assert_equal +from numpy.testing import assert_allclose +from numpy.testing import assert_array_less +from numpy.testing import assert_raises + +from unittest import TestCase + +from sklearn.utils.estimator_checks import check_estimator + +from sklearn.metrics import roc_auc_score +from scipy.stats import rankdata + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from pyod.utils.data import generate_data + +_dummy = TestCase('__init__') +assert_greater = _dummy.assertGreater +assert_greater_equal = _dummy.assertGreaterEqual +assert_less = _dummy.assertLess +assert_less_equal = _dummy.assertLessEqual + + +class UODCommonTest: + def __init__(self, + model, + X_train, + y_train, + X_test, + y_test, + roc_floor, + ): + self.clf = model + self.X_train = X_train + self.y_train = y_train + self.X_test = X_test + self.y_test = y_test + self.roc_floor = roc_floor + + self.clf.fit(self.X_train) + + pass + + def test_detector(self): + + self.test_parameters() + self.test_train_scores() + self.test_prediction_scores() + self.test_prediction_proba() + self.test_prediction_proba_linear() + self.test_prediction_proba_unify() + self.test_prediction_proba_parameter() + # self.test_fit_predict() + # self.test_fit_predict_score() + self.test_prediction_labels() + # self.test_predict_rank() + # self.test_predict_rank_normalized() + self.tearDown() + + def test_parameters(self): + assert (hasattr(self.clf, 'decision_scores_') and + self.clf.decision_scores_ is not None) + assert (hasattr(self.clf, 'labels_') and + self.clf.labels_ is not None) + assert (hasattr(self.clf, 'threshold_') and + self.clf.threshold_ is not None) + assert (hasattr(self.clf, '_mu') and + self.clf._mu is not None) + assert (hasattr(self.clf, '_sigma') and + self.clf._sigma is not None) + + def test_train_scores(self): + assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) + + def test_prediction_scores(self): + pred_scores = self.clf.decision_function(self.X_test) + + # check score shapes + assert_equal(pred_scores.shape[0], self.y_test.shape[0]) + + # check performance + assert_greater_equal(roc_auc_score(self.y_test, pred_scores), self.roc_floor) + + def test_prediction_labels(self): + pred_labels = self.clf.predict(self.X_test) + assert_equal(pred_labels.shape, self.y_test.shape) + + def test_prediction_proba(self): + pred_proba = self.clf.predict_proba(self.X_test) + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_linear(self): + pred_proba = self.clf.predict_proba(self.X_test, method='linear') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_unify(self): + pred_proba = self.clf.predict_proba(self.X_test, method='unify') + assert_greater_equal(pred_proba.min(), 0) + assert_less_equal(pred_proba.max(), 1) + + def test_prediction_proba_parameter(self): + with assert_raises(ValueError): + self.clf.predict_proba(self.X_test, method='something') + + def test_fit_predict(self): # pragma: no cover + pred_labels = self.clf.fit_predict(X=self.X_train) + assert_equal(pred_labels.shape, self.y_train.shape) + + def test_fit_predict_score(self): # pragma: no cover + self.clf.fit_predict_score(self.X_test, self.y_test) + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='roc_auc_score') + 
self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='prc_n_score') + with assert_raises(NotImplementedError): + self.clf.fit_predict_score(self.X_test, self.y_test, + scoring='something') + + def test_predict_rank(self): # pragma: no cover + pred_socres = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, self.X_train.shape[0] + 1) + assert_array_less(-0.1, pred_ranks) + + def test_predict_rank_normalized(self): # pragma: no cover + pred_socres = self.clf.decision_function(self.X_test) + pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) + + # assert the order is reserved + assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) + assert_array_less(pred_ranks, 1.01) + assert_array_less(-0.1, pred_ranks) + + def tearDown(self): + pass diff --git a/tods/detection_algorithm/core/__init__.py b/tods/detection_algorithm/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tods/detection_algorithm/core/__pycache__/AutoRegOD.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/AutoRegOD.cpython-37.pyc new file mode 100644 index 00000000..bab47647 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/AutoRegOD.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/AutoRegOD.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/AutoRegOD.cpython-38.pyc new file mode 100644 index 00000000..3b45aa88 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/AutoRegOD.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/CollectiveBase.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/CollectiveBase.cpython-37.pyc new file mode 100644 index 00000000..9ef916bb Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/CollectiveBase.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/CollectiveBase.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/CollectiveBase.cpython-38.pyc new file mode 100644 index 00000000..4a6f1986 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/CollectiveBase.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/CollectiveCommonTest.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/CollectiveCommonTest.cpython-37.pyc new file mode 100644 index 00000000..50c42bb0 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/CollectiveCommonTest.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/KDiscord.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/KDiscord.cpython-37.pyc new file mode 100644 index 00000000..af285f3e Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/KDiscord.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/KDiscord.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/KDiscord.cpython-38.pyc new file mode 100644 index 00000000..817d58ff Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/KDiscord.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/LSTMOD.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/LSTMOD.cpython-37.pyc new file mode 100644 index 00000000..f624cd8c Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/LSTMOD.cpython-37.pyc differ diff --git 
a/tods/detection_algorithm/core/__pycache__/LSTMOD.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/LSTMOD.cpython-38.pyc new file mode 100644 index 00000000..bde8b582 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/LSTMOD.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/MultiAutoRegOD.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/MultiAutoRegOD.cpython-37.pyc new file mode 100644 index 00000000..2b5942ec Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/MultiAutoRegOD.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/MultiAutoRegOD.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/MultiAutoRegOD.cpython-38.pyc new file mode 100644 index 00000000..a6336092 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/MultiAutoRegOD.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/PCA.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/PCA.cpython-37.pyc new file mode 100644 index 00000000..62a6e6ae Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/PCA.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/PCA.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/PCA.cpython-38.pyc new file mode 100644 index 00000000..94fdf8db Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/PCA.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/SODCommonTest.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/SODCommonTest.cpython-37.pyc new file mode 100644 index 00000000..0fe18a63 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/SODCommonTest.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/UODCommonTest.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/UODCommonTest.cpython-37.pyc new file mode 100644 index 00000000..80088633 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/UODCommonTest.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/__init__.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 00000000..63c86b3e Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/__init__.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/__init__.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 00000000..88663d0e Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/__init__.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/test_CollectiveBase.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/test_CollectiveBase.cpython-37.pyc new file mode 100644 index 00000000..2fb6a3d0 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/test_CollectiveBase.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/utility.cpython-37.pyc b/tods/detection_algorithm/core/__pycache__/utility.cpython-37.pyc new file mode 100644 index 00000000..6fd992c6 Binary files /dev/null and b/tods/detection_algorithm/core/__pycache__/utility.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/__pycache__/utility.cpython-38.pyc b/tods/detection_algorithm/core/__pycache__/utility.cpython-38.pyc new file mode 100644 index 00000000..f81f8d5f Binary files /dev/null and 
b/tods/detection_algorithm/core/__pycache__/utility.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/algorithm_implementation.py b/tods/detection_algorithm/core/algorithm_implementation.py new file mode 100644 index 00000000..e69de29b diff --git a/tods/detection_algorithm/core/dagmm/__init__.py b/tods/detection_algorithm/core/dagmm/__init__.py new file mode 100644 index 00000000..cc055001 --- /dev/null +++ b/tods/detection_algorithm/core/dagmm/__init__.py @@ -0,0 +1,6 @@ +# -*- coding: utf-8 -*- + +from .compression_net import CompressionNet +from .estimation_net import EstimationNet +from .gmm import GMM +from .dagmm import DAGMM diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/__init__.cpython-37.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 00000000..b90a42fd Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/__init__.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/__init__.cpython-38.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 00000000..9f412d8e Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/__init__.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/compression_net.cpython-37.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/compression_net.cpython-37.pyc new file mode 100644 index 00000000..a0504b7b Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/compression_net.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/compression_net.cpython-38.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/compression_net.cpython-38.pyc new file mode 100644 index 00000000..f0b74b2a Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/compression_net.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/dagmm.cpython-37.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/dagmm.cpython-37.pyc new file mode 100644 index 00000000..ab5f8cd2 Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/dagmm.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/dagmm.cpython-38.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/dagmm.cpython-38.pyc new file mode 100644 index 00000000..4bef0dda Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/dagmm.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/estimation_net.cpython-37.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/estimation_net.cpython-37.pyc new file mode 100644 index 00000000..ece74703 Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/estimation_net.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/estimation_net.cpython-38.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/estimation_net.cpython-38.pyc new file mode 100644 index 00000000..95d774b8 Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/estimation_net.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/__pycache__/gmm.cpython-37.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/gmm.cpython-37.pyc new file mode 100644 index 00000000..30eca59a Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/gmm.cpython-37.pyc differ diff --git 
a/tods/detection_algorithm/core/dagmm/__pycache__/gmm.cpython-38.pyc b/tods/detection_algorithm/core/dagmm/__pycache__/gmm.cpython-38.pyc new file mode 100644 index 00000000..ce98c971 Binary files /dev/null and b/tods/detection_algorithm/core/dagmm/__pycache__/gmm.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/dagmm/compression_net.py b/tods/detection_algorithm/core/dagmm/compression_net.py new file mode 100644 index 00000000..759dbdc0 --- /dev/null +++ b/tods/detection_algorithm/core/dagmm/compression_net.py @@ -0,0 +1,121 @@ +import tensorflow as tf +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior()
+
+class CompressionNet:
+ """ Compression Network.
+ This network converts the input data to representations
+ suitable for calculating anomaly scores in the "Estimation Network".
+
+ Outputs of the network consist of the following two components:
+ 1) reduced low-dimensional representations learned by the AutoEncoder.
+ 2) the features derived from the reconstruction error.
+ """
+ def __init__(self, hidden_layer_sizes, activation=tf.nn.tanh):
+ """
+ Parameters
+ ----------
+ hidden_layer_sizes : list of int
+ list of the sizes of the hidden layers.
+ For example, if the sizes are [n1, n2],
+ the structure of the created network is:
+ input_size -> n1 -> n2 -> n1 -> input_size
+ (the network outputs the representation of the "n2" layer)
+ activation : function
+ activation function of the hidden layers.
+ the last layer uses a linear function.
+ """
+ self.hidden_layer_sizes = hidden_layer_sizes
+ self.activation = activation
+
+ def compress(self, x):
+ self.input_size = x.shape[1]
+
+ with tf.variable_scope("Encoder"):
+ z = x
+ n_layer = 0
+ for size in self.hidden_layer_sizes[:-1]:
+ n_layer += 1
+ z = tf.layers.dense(z, size, activation=self.activation,
+ name="layer_{}".format(n_layer))
+
+ # activation function of the last layer is linear
+ n_layer += 1
+ z = tf.layers.dense(z, self.hidden_layer_sizes[-1],
+ name="layer_{}".format(n_layer))
+
+ return z
+
+ def reverse(self, z):
+ with tf.variable_scope("Decoder"):
+ n_layer = 0
+ for size in self.hidden_layer_sizes[:-1][::-1]:
+ n_layer += 1
+ z = tf.layers.dense(z, size, activation=self.activation,
+ name="layer_{}".format(n_layer))
+
+ # activation function of the last layer is linear
+ n_layer += 1
+ x_dash = tf.layers.dense(z, self.input_size,
+ name="layer_{}".format(n_layer))
+
+ return x_dash
+
+ def loss(self, x, x_dash):
+ def euclid_norm(x):
+ return tf.sqrt(tf.reduce_sum(tf.square(x), axis=1))
+
+ # Calculate Euclidean norms and the reconstruction distance
+ norm_x = euclid_norm(x)
+ norm_x_dash = euclid_norm(x_dash)
+ dist_x = euclid_norm(x - x_dash)
+ dot_x = tf.reduce_sum(x * x_dash, axis=1)
+
+ # Based on the original paper, the features of the reconstruction error
+ # are composed of these loss functions:
+ # 1. loss_E : relative Euclidean distance
+ # 2. loss_C : cosine similarity
+ min_val = 1e-3
+ loss_E = dist_x / (norm_x + min_val)
+ loss_C = 0.5 * (1.0 - dot_x / (norm_x * norm_x_dash + min_val))
+ return tf.concat([loss_E[:,None], loss_C[:,None]], axis=1)
+
+ def extract_feature(self, x, x_dash, z_c):
+ z_r = self.loss(x, x_dash)
+ return tf.concat([z_c, z_r], axis=1)
+
+ def inference(self, x):
+ """ Convert input to an output tensor, which is composed of the
+ low-dimensional representation and the reconstruction error.
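+
+ Informally, the two extra features appended by ``extract_feature``
+ are computed in ``loss`` above (``1e-3`` guards against division
+ by zero)::
+
+ loss_E = ||x - x_dash|| / (||x|| + 1e-3)
+ loss_C = 0.5 * (1 - <x, x_dash> / (||x|| * ||x_dash|| + 1e-3))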
+ + Parameters + ---------- + x : tf.Tensor shape : (n_samples, n_features) + Input data + + Results + ------- + z : tf.Tensor shape : (n_samples, n2 + 2) + Result data + Second dimension of this data is equal to + sum of compressed representation size and + number of loss function (=2) + + x_dash : tf.Tensor shape : (n_samples, n_features) + Reconstructed data for calculation of + reconstruction error. + """ + + with tf.variable_scope("CompNet"): + # AutoEncoder + z_c = self.compress(x) + x_dash = self.reverse(z_c) + + # compose feature vector + z = self.extract_feature(x, x_dash, z_c) + + return z, x_dash + + def reconstruction_error(self, x, x_dash): + return tf.reduce_mean(tf.reduce_sum( + tf.square(x - x_dash), axis=1), axis=0) diff --git a/tods/detection_algorithm/core/dagmm/dagmm.py b/tods/detection_algorithm/core/dagmm/dagmm.py new file mode 100644 index 00000000..02f882f1 --- /dev/null +++ b/tods/detection_algorithm/core/dagmm/dagmm.py @@ -0,0 +1,251 @@ +import tensorflow as tf +import numpy as np +from sklearn.preprocessing import StandardScaler +import joblib + +from .compression_net import CompressionNet +from .estimation_net import EstimationNet +from .gmm import GMM +from pyod.utils.stat_models import pairwise_distances_no_broadcast + +from os import makedirs +from os.path import exists, join +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() + +from pyod.models.base import BaseDetector + +class DAGMM(BaseDetector): + """ Deep Autoencoding Gaussian Mixture Model. + + This implementation is based on the paper: + Bo Zong+ (2018) Deep Autoencoding Gaussian Mixture Model + for Unsupervised Anomaly Detection, ICLR 2018 + (this is UNOFFICIAL implementation) + """ + + MODEL_FILENAME = "DAGMM_model" + SCALER_FILENAME = "DAGMM_scaler" + + def __init__(self, comp_hiddens:list = [16,8,1], + est_hiddens:list = [8,4], est_dropout_ratio:float =0.5, + minibatch_size:int = 1024, epoch_size:int =100, + learning_rate:float =0.0001, lambda1:float =0.1, lambda2:float =0.0001, + normalize:bool=True, random_seed:int=123 , contamination:float = 0.001 ): + """ + Parameters + ---------- + comp_hiddens : list of int + sizes of hidden layers of compression network + For example, if the sizes are [n1, n2], + structure of compression network is: + input_size -> n1 -> n2 -> n1 -> input_sizes + + est_hiddens : list of int + sizes of hidden layers of estimation network. + The last element of this list is assigned as n_comp. + For example, if the sizes are [n1, n2], + structure of estimation network is: + input_size -> n1 -> n2 (= n_comp) + + est_dropout_ratio : float (optional) + dropout ratio of estimation network applied during training + if 0 or None, dropout is not applied. + minibatch_size: int (optional) + mini batch size during training + epoch_size : int (optional) + epoch size during training + learning_rate : float (optional) + learning rate during training + lambda1 : float (optional) + a parameter of loss function (for energy term) + lambda2 : float (optional) + a parameter of loss function + (for sum of diagonal elements of covariance) + normalize : bool (optional) + specify whether input data need to be normalized. + by default, input data is normalized. + random_seed : int (optional) + random seed used when fit() is called. 
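+
+ Example (illustrative sketch; the data and hyper-parameters below
+ are arbitrary)::
+
+ import numpy as np
+ X = np.random.rand(500, 20).astype(np.float32)
+ model = DAGMM(comp_hiddens=[16, 8, 2], est_hiddens=[8, 4],
+ minibatch_size=128, epoch_size=10)
+ model.fit(X)
+ scores = model.decision_function(X) # shape (1, n_samples)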
+ """ + est_activation = tf.nn.tanh + comp_activation = tf.nn.tanh + super(DAGMM, self).__init__(contamination=contamination) + self.comp_net = CompressionNet(comp_hiddens, comp_activation) + self.est_net = EstimationNet(est_hiddens, est_activation) + self.est_dropout_ratio = est_dropout_ratio + + n_comp = est_hiddens[-1] + self.gmm = GMM(n_comp) + + self.minibatch_size = minibatch_size + self.epoch_size = epoch_size + self.learning_rate = learning_rate + self.lambda1 = lambda1 + self.lambda2 = lambda2 + + self.normalize = normalize + self.scaler = None + self.seed = random_seed + + self.graph = None + self.sess = None + + #def __del__(self): + # if self.sess is not None: + # self.sess.close() + + def fit(self,X,y=None): + """ Fit the DAGMM model according to the given data. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Training data. + """ + + n_samples, n_features = X.shape + + if self.normalize: + self.scaler = scaler = StandardScaler() + X = scaler.fit_transform(X) + + with tf.Graph().as_default() as graph: + self.graph = graph + tf.set_random_seed(self.seed) + np.random.seed(seed=self.seed) + + # Create Placeholder + self.input = input = tf.placeholder( + dtype=tf.float32, shape=[None, n_features]) + self.drop = drop = tf.placeholder(dtype=tf.float32, shape=[]) + + # Build graph + z, x_dash = self.comp_net.inference(input) + gamma = self.est_net.inference(z, drop) + self.gmm.fit(z, gamma) + energy = self.gmm.energy(z) + + self.x_dash = x_dash + + # Loss function + loss = (self.comp_net.reconstruction_error(input, x_dash) + + self.lambda1 * tf.reduce_mean(energy) + + self.lambda2 * self.gmm.cov_diag_loss()) + + # Minimizer + minimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(loss) + + # Number of batch + n_batch = (n_samples - 1) // self.minibatch_size + 1 + + # Create tensorflow session and initilize + init = tf.global_variables_initializer() + + self.sess = tf.Session(graph=graph) + self.sess.run(init) + + # Training + idx = np.arange(X.shape[0]) + np.random.shuffle(idx) + + for epoch in range(self.epoch_size): + for batch in range(n_batch): + i_start = batch * self.minibatch_size + i_end = (batch + 1) * self.minibatch_size + x_batch = X[idx[i_start:i_end]] + + self.sess.run(minimizer, feed_dict={ + input:x_batch, drop:self.est_dropout_ratio}) + if (epoch + 1) % 10 == 0: + loss_val = self.sess.run(loss, feed_dict={input:X, drop:0}) + print(" epoch {}/{} : loss = {:.3f}".format(epoch + 1, self.epoch_size, loss_val)) + + # Fix GMM parameter + fix = self.gmm.fix_op() + self.sess.run(fix, feed_dict={input:X, drop:0}) + self.energy = self.gmm.energy(z) + + tf.add_to_collection("save", self.input) + tf.add_to_collection("save", self.energy) + + self.saver = tf.train.Saver() + + pred_scores = self.decision_function(X) + self.decision_scores_ = pred_scores + self._process_decision_scores() + #return self + + def decision_function(self, X): + """ Calculate anormaly scores (sample energy) on samples in X. + + Parameters + ---------- + X : array-like, shape (n_samples, n_features) + Data for which anomaly scores are calculated. + n_features must be equal to n_features of the fitted data. + + Returns + ------- + energies : array-like, shape (n_samples) + Calculated sample energies. 
+ """ + if self.sess is None: + raise Exception("Trained model does not exist.") + + if self.normalize: + X = self.scaler.transform(X) + + energies = self.sess.run(self.energy, feed_dict={self.input:X}) + + return energies.reshape(1,-1) + + def save(self, fdir): + """ Save trained model to designated directory. + This method have to be called after training. + (If not, throw an exception) + + Parameters + ---------- + fdir : str + Path of directory trained model is saved. + If not exists, it is created automatically. + """ + if self.sess is None: + raise Exception("Trained model does not exist.") + + if not exists(fdir): + makedirs(fdir) + + model_path = join(fdir, self.MODEL_FILENAME) + self.saver.save(self.sess, model_path) + + if self.normalize: + scaler_path = join(fdir, self.SCALER_FILENAME) + joblib.dump(self.scaler, scaler_path) + + def restore(self, fdir): + """ Restore trained model from designated directory. + + Parameters + ---------- + fdir : str + Path of directory trained model is saved. + """ + if not exists(fdir): + raise Exception("Model directory does not exist.") + + model_path = join(fdir, self.MODEL_FILENAME) + meta_path = model_path + ".meta" + + with tf.Graph().as_default() as graph: + self.graph = graph + self.sess = tf.Session(graph=graph) + self.saver = tf.train.import_meta_graph(meta_path) + self.saver.restore(self.sess, model_path) + + self.input, self.energy = tf.get_collection("save") + + if self.normalize: + scaler_path = join(fdir, self.SCALER_FILENAME) + self.scaler = joblib.load(scaler_path) diff --git a/tods/detection_algorithm/core/dagmm/estimation_net.py b/tods/detection_algorithm/core/dagmm/estimation_net.py new file mode 100644 index 00000000..6dbe7cc6 --- /dev/null +++ b/tods/detection_algorithm/core/dagmm/estimation_net.py @@ -0,0 +1,63 @@ +# -*- coding: utf-8 -*- +import tensorflow as tf +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() + +class EstimationNet: + """ Estimation Network + + This network converts input feature vector to softmax probability. + Bacause loss function for this network is not defined, + it should be implemented outside of this class. + """ + def __init__(self, hidden_layer_sizes, activation=tf.nn.relu): + """ + Parameters + ---------- + hidden_layer_sizes : list of int + list of sizes of hidden layers. + For example, if the sizes are [n1, n2], + layer sizes of the network are: + input_size -> n1 -> n2 + (network outputs the softmax probabilities of "n2" layer) + activation : function + activation function of hidden layer. + the funtcion of last layer is softmax function. 
+ """ + self.hidden_layer_sizes = hidden_layer_sizes + self.activation = activation + + def inference(self, z, dropout_ratio=None): + """ Output softmax probabilities + + Parameters + ---------- + z : tf.Tensor shape : (n_samples, n_features) + Data inferenced by this network + dropout_ratio : tf.Tensor shape : 0-dimension float (optional) + Specify dropout ratio + (if None, dropout is not applied) + + Results + ------- + probs : tf.Tensor shape : (n_samples, n_classes) + Calculated probabilities + """ + with tf.variable_scope("EstNet"): + n_layer = 0 + for size in self.hidden_layer_sizes[:-1]: + n_layer += 1 + z = tf.layers.dense(z, size, activation=self.activation, + name="layer_{}".format(n_layer)) + if dropout_ratio is not None: + z = tf.layers.dropout(z, dropout_ratio, + name="drop_{}".format(n_layer)) + + # Last layer uses linear function (=logits) + size = self.hidden_layer_sizes[-1] + logits = tf.layers.dense(z, size, activation=None, name="logits") + + # Softmax output + output = tf.nn.softmax(logits) + + return output diff --git a/tods/detection_algorithm/core/dagmm/gmm.py b/tods/detection_algorithm/core/dagmm/gmm.py new file mode 100644 index 00000000..6da40700 --- /dev/null +++ b/tods/detection_algorithm/core/dagmm/gmm.py @@ -0,0 +1,130 @@ +# -*- coding: utf-8 -*- +import numpy as np +import tensorflow as tf +import tensorflow.compat.v1 as tf +tf.disable_v2_behavior() +class GMM: + """ Gaussian Mixture Model (GMM) """ + def __init__(self, n_comp): + self.n_comp = n_comp + self.phi = self.mu = self.sigma = None + self.training = False + + def create_variables(self, n_features): + with tf.variable_scope("GMM"): + phi = tf.Variable(tf.zeros(shape=[self.n_comp]), + dtype=tf.float32, name="phi") + mu = tf.Variable(tf.zeros(shape=[self.n_comp, n_features]), + dtype=tf.float32, name="mu") + sigma = tf.Variable(tf.zeros( + shape=[self.n_comp, n_features, n_features]), + dtype=tf.float32, name="sigma") + L = tf.Variable(tf.zeros( + shape=[self.n_comp, n_features, n_features]), + dtype=tf.float32, name="L") + + return phi, mu, sigma, L + + def fit(self, z, gamma): + """ fit data to GMM model + + Parameters + ---------- + z : tf.Tensor, shape (n_samples, n_features) + data fitted to GMM. + gamma : tf.Tensor, shape (n_samples, n_comp) + probability. each row is correspond to row of z. + """ + + with tf.variable_scope("GMM"): + # Calculate mu, sigma + # i : index of samples + # k : index of components + # l,m : index of features + gamma_sum = tf.reduce_sum(gamma, axis=0) + self.phi = phi = tf.reduce_mean(gamma, axis=0) + self.mu = mu = tf.einsum('ik,il->kl', gamma, z) / gamma_sum[:,None] + z_centered = tf.sqrt(gamma[:,:,None]) * (z[:,None,:] - mu[None,:,:]) + self.sigma = sigma = tf.einsum( + 'ikl,ikm->klm', z_centered, z_centered) / gamma_sum[:,None,None] + + # Calculate a cholesky decomposition of covariance in advance + n_features = z.shape[1] + min_vals = tf.diag(tf.ones(n_features, dtype=tf.float32)) * 1e-6 + self.L = tf.cholesky(sigma + min_vals[None,:,:]) + + self.training = False + return self + + def fix_op(self): + """ return operator to fix paramters of GMM + Using this operator outside of this class, + you can fix current parameter to static tensor variable. + + After you call this method, you have to run result + operator immediatelly, and call energy() to use static + variables of model parameter. 
+ + Returns + ------- + op : operator of tensorflow + operator to assign current parameter to variables + """ + + phi, mu, sigma, L = self.create_variables(self.mu.shape[1]) + + op = tf.group( + tf.assign(phi, self.phi), + tf.assign(mu, self.mu), + tf.assign(sigma, self.sigma), + tf.assign(L, self.L) + ) + + self.phi, self.phi_org = phi, self.phi + self.mu, self.mu_org = mu, self.mu + self.sigma, self.sigma_org = sigma, self.sigma + self.L, self.L_org = L, self.L + + self.training = False + + return op + + def energy(self, z): + """ calculate an energy of each row of z + + Parameters + ---------- + z : tf.Tensor, shape (n_samples, n_features) + data each row of which is calculated its energy. + + Returns + ------- + energy : tf.Tensor, shape (n_samples) + calculated energies + """ + + if self.training and self.phi is None: + self.phi, self.mu, self.sigma, self.L = self.create_variable(z.shape[1]) + + with tf.variable_scope("GMM_energy"): + # Instead of inverse covariance matrix, exploit cholesky decomposition + # for stability of calculation. + z_centered = z[:,None,:] - self.mu[None,:,:] #ikl + v = tf.matrix_triangular_solve(self.L, tf.transpose(z_centered, [1, 2, 0])) # kli + + # log(det(Sigma)) = 2 * sum[log(diag(L))] + log_det_sigma = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(self.L)), axis=1) + + # To calculate energies, use "log-sum-exp" (different from orginal paper) + d = z.get_shape().as_list()[1] + logits = tf.log(self.phi[:,None]) - 0.5 * (tf.reduce_sum(tf.square(v), axis=1) + + d * tf.log(2.0 * np.pi) + log_det_sigma[:,None]) + energies = - tf.reduce_logsumexp(logits, axis=0) + + return energies + + def cov_diag_loss(self): + with tf.variable_scope("GMM_diag_loss"): + diag_loss = tf.reduce_sum(tf.divide(1, tf.matrix_diag_part(self.sigma))) + + return diag_loss diff --git a/tods/detection_algorithm/core/test_CollectiveBase.py b/tods/detection_algorithm/core/test_CollectiveBase.py new file mode 100644 index 00000000..92e6ff92 --- /dev/null +++ b/tods/detection_algorithm/core/test_CollectiveBase.py @@ -0,0 +1,211 @@ +# -*- coding: utf-8 -*- +from __future__ import division # pragma: no cover +from __future__ import print_function # pragma: no cover + +import os # pragma: no cover +import sys # pragma: no cover + +import unittest # pragma: no cover +from sklearn.utils.testing import assert_equal # pragma: no cover +from sklearn.utils.testing import assert_raises # pragma: no cover + +import numpy as np # pragma: no cover + +# temporary solution for relative imports in case pyod is not installed +# if pyod is installed, no need to use the following line +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) # pragma: no cover + +from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector # pragma: no cover +from pyod.utils.data import generate_data # pragma: no cover + + +# Check sklearn\tests\test_base +# A few test classes +# noinspection PyMissingConstructor,PyPep8Naming +class MyEstimator(CollectiveBaseDetector): # pragma: no cover + + def __init__(self, l1=0, empty=None): # pragma: no cover + self.l1 = l1 + self.empty = empty + + def fit(self, X, y=None): # pragma: no cover + pass + + def decision_function(self, X): # pragma: no cover + pass + + +# noinspection PyMissingConstructor +class K(CollectiveBaseDetector): # pragma: no cover + def __init__(self, c=None, d=None): # pragma: no cover + self.c = c + self.d = d + + def fit(self, X, y=None): # pragma: no cover + pass + + def decision_function(self, X): # pragma: no cover + pass + 
+ +# noinspection PyMissingConstructor +class T(CollectiveBaseDetector): # pragma: no cover + def __init__(self, a=None, b=None): # pragma: no cover + self.a = a + self.b = b + + def fit(self, X, y=None): # pragma: no cover + pass + + def decision_function(self, X): # pragma: no cover + pass + + +# noinspection PyMissingConstructor +class ModifyInitParams(CollectiveBaseDetector): # pragma: no cover + """Deprecated behavior. + Equal parameters but with a type cast. + Doesn't fulfill a is a + """ + + def __init__(self, a=np.array([0])): # pragma: no cover + self.a = a.copy() + + def fit(self, X, y=None): # pragma: no cover + pass + + def decision_function(self, X): # pragma: no cover + pass + + +# noinspection PyMissingConstructor +class VargEstimator(CollectiveBaseDetector): # pragma: no cover + """scikit-learn estimators shouldn't have vargs.""" + + def __init__(self, *vargs): # pragma: no cover + pass + + def fit(self, X, y=None): # pragma: no cover + pass + + def decision_function(self, X): # pragma: no cover + pass + + +class Dummy1(CollectiveBaseDetector): # pragma: no cover + def __init__(self, contamination=0.1): # pragma: no cover + super(Dummy1, self).__init__(contamination=contamination) + + def decision_function(self, X): # pragma: no cover + pass + + def fit(self, X, y=None): # pragma: no cover + pass + + +class Dummy2(CollectiveBaseDetector): # pragma: no cover + def __init__(self, contamination=0.1): # pragma: no cover + super(Dummy2, self).__init__(contamination=contamination) + + def decision_function(self, X): # pragma: no cover + pass + + def fit(self, X, y=None): # pragma: no cover + return X + + +class Dummy3(CollectiveBaseDetector): # pragma: no cover + def __init__(self, contamination=0.1): # pragma: no cover + super(Dummy3, self).__init__(contamination=contamination) + + def decision_function(self, X): # pragma: no cover + pass + + def fit(self, X, y=None): # pragma: no cover + self.labels_ = X + + +class TestBASE(unittest.TestCase): # pragma: no cover + def setUp(self): # pragma: no cover + self.n_train = 100 + self.n_test = 50 + self.contamination = 0.1 + self.roc_floor = 0.6 + self.X_train, self.y_train, self.X_test, self.y_test = generate_data( + n_train=self.n_train, n_test=self.n_test, + contamination=self.contamination) + + def test_init(self): # pragma: no cover + """ + Test base class initialization + + :return: + """ + self.dummy_clf = Dummy1() + assert_equal(self.dummy_clf.contamination, 0.1) + + self.dummy_clf = Dummy1(contamination=0.2) + assert_equal(self.dummy_clf.contamination, 0.2) + + with assert_raises(ValueError): + Dummy1(contamination=0.51) + + with assert_raises(ValueError): + Dummy1(contamination=0) + + with assert_raises(ValueError): + Dummy1(contamination=-0.5) + + def test_fit(self): # pragma: no cover + self.dummy_clf = Dummy2() + assert_equal(self.dummy_clf.fit(0), 0) + + def test_fit_predict(self): # pragma: no cover + # TODO: add more testcases + + self.dummy_clf = Dummy3() + + assert_equal(self.dummy_clf.fit_predict(0), 0) + + def test_predict_proba(self): # pragma: no cover + # TODO: create uniform testcases + pass + + def test_rank(self): # pragma: no cover + # TODO: create uniform testcases + pass + + def test_repr(self): # pragma: no cover + # Smoke test the repr of the base estimator. 
+ my_estimator = MyEstimator() + repr(my_estimator) + test = T(K(), K()) + assert_equal( + repr(test), + "T(a=K(c=None, d=None), b=K(c=None, d=None))" + ) + + some_est = T(a=["long_params"] * 1000) + assert_equal(len(repr(some_est)), 415) + + def test_str(self): # pragma: no cover + # Smoke test the str of the base estimator + my_estimator = MyEstimator() + str(my_estimator) + + def test_get_params(self): # pragma: no cover + test = T(K(), K()) + + assert ('a__d' in test.get_params(deep=True)) + assert ('a__d' not in test.get_params(deep=False)) + + test.set_params(a__d=2) + assert (test.a.d == 2) + assert_raises(ValueError, test.set_params, a__a=2) + + def tearDown(self): # pragma: no cover + pass + + +if __name__ == '__main__': # pragma: no cover + unittest.main() diff --git a/tods/detection_algorithm/core/utility.py b/tods/detection_algorithm/core/utility.py new file mode 100644 index 00000000..a486b995 --- /dev/null +++ b/tods/detection_algorithm/core/utility.py @@ -0,0 +1,179 @@ +# -*- coding: utf-8 -*- +"""Utility functions for supporting time-series based outlier detection. +""" + +import numpy as np +from sklearn.utils import check_array + + +# def get_sub_sequences(X, window_size, step=1): +# """Chop a univariate time series into sub sequences. + +# Parameters +# ---------- +# X : numpy array of shape (n_samples,) +# The input samples. + +# window_size : int +# The moving window size. + +# step_size : int, optional (default=1) +# The displacement for moving window. + +# Returns +# ------- +# X_sub : numpy array of shape (valid_len, window_size) +# The numpy matrix with each row stands for a subsequence. +# """ +# X = check_array(X).astype(np.float) +# n_samples = len(X) + +# # get the valid length +# valid_len = get_sub_sequences_length(n_samples, window_size, step) + +# X_sub = np.zeros([valid_len, window_size]) +# # y_sub = np.zeros([valid_len, 1]) + +# # exclude the edge +# steps = list(range(0, n_samples, step)) +# steps = steps[:valid_len] + +# for idx, i in enumerate(steps): +# X_sub[idx,] = X[i: i + window_size].ravel() + +# return X_sub + +def get_sub_matrices(X, window_size, step=1, return_numpy=True, flatten=True, + flatten_order='F'): + """Chop a multivariate time series into sub sequences (matrices). + + Parameters + ---------- + X : numpy array of shape (n_samples,) + The input samples. + + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + return_numpy : bool, optional (default=True) + If True, return the data format in 3d numpy array. + + flatten : bool, optional (default=True) + If True, flatten the returned array in 2d. + + flatten_order : str, optional (default='F') + Decide the order of the flatten for multivarite sequences. + ‘C’ means to flatten in row-major (C-style) order. + ‘F’ means to flatten in column-major (Fortran- style) order. + ‘A’ means to flatten in column-major order if a is Fortran contiguous in memory, + row-major order otherwise. ‘K’ means to flatten a in the order the elements occur in memory. + The default is ‘F’. + + Returns + ------- + X_sub : numpy array of shape (valid_len, window_size*n_sequences) + The numpy matrix with each row stands for a flattend submatrix. 
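+
+ Example (informal sketch): for a (5, 2) series with ``window_size=3``
+ and ``step=1`` there are 3 valid windows, so the call below returns a
+ (3, 6) matrix together with left indices [0, 1, 2] and right indices
+ [3, 4, 5]::
+
+ X = np.arange(10, dtype=float).reshape(5, 2)
+ X_sub, left_inds, right_inds = get_sub_matrices(X, window_size=3, step=1)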
+ """ + X = check_array(X).astype(np.float) + n_samples, n_sequences = X.shape[0], X.shape[1] + + # get the valid length + valid_len = get_sub_sequences_length(n_samples, window_size, step) + + X_sub = [] + X_left_inds = [] + X_right_inds = [] + + # exclude the edge + steps = list(range(0, n_samples, step)) + steps = steps[:valid_len] + + # print(n_samples, n_sequences) + for idx, i in enumerate(steps): + X_sub.append(X[i: i + window_size, :]) + X_left_inds.append(i) + X_right_inds.append(i + window_size) + + X_sub = np.asarray(X_sub) + + if return_numpy: + if flatten: + temp_array = np.zeros([valid_len, window_size * n_sequences]) + if flatten_order == 'C': + for i in range(valid_len): + temp_array[i, :] = X_sub[i, :, :].flatten(order='C') + + else: + for i in range(valid_len): + temp_array[i, :] = X_sub[i, :, :].flatten(order='F') + return temp_array, np.asarray(X_left_inds), np.asarray( + X_right_inds) + + else: + return np.asarray(X_sub), np.asarray(X_left_inds), np.asarray( + X_right_inds) + else: + return X_sub, np.asarray(X_left_inds), np.asarray(X_right_inds) + + +def get_sub_sequences_length(n_samples, window_size, step): + """Pseudo chop a univariate time series into sub sequences. Return valid + length only. + + Parameters + ---------- + X : numpy array of shape (n_samples,) + The input samples. + + window_size : int + The moving window size. + + step_size : int, optional (default=1) + The displacement for moving window. + + Returns + ------- + valid_len : int + The number of subsequences. + + """ + # if X.shape[0] == 1: + # n_samples = X.shape[1] + # elif X.shape[1] == 1: + # n_samples = X.shape[0] + # else: + # raise ValueError("X is not a univarite series. The shape is {shape}.".format(shape=X.shape)) + + # valid_len = n_samples - window_size + 1 + # valida_len = int_down(n_samples-window_size)/step + 1 + valid_len = int(np.floor((n_samples - window_size) / step)) + 1 + return valid_len + + +if __name__ == "__main__": + X_train = np.asarray( + [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, + 100]).reshape(-1, 1) + + X_train = np.asarray( + [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], + [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) + + # n_samples = X.shape[0] + + window_size = 3 + + # valid_len = n_samples - window_size + 1 + + # X_sub = np.zeros([valid_len, window_size]) + + # for i in range(valid_len): + # X_sub[i, ] = X[i: i+window_size] + + # X_sub_2 = get_sub_sequences(X, window_size, step=2) + X_sub_3, X_left_inds, X_right_inds = get_sub_matrices(X_train, window_size, + step=2, + flatten_order='C') diff --git a/tods/detection_algorithm/core/utils/__init__.py b/tods/detection_algorithm/core/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tods/detection_algorithm/core/utils/__pycache__/__init__.cpython-37.pyc b/tods/detection_algorithm/core/utils/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 00000000..d8616d62 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/__init__.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/__init__.cpython-38.pyc b/tods/detection_algorithm/core/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 00000000..92ca8adf Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/channel.cpython-37.pyc b/tods/detection_algorithm/core/utils/__pycache__/channel.cpython-37.pyc new 
file mode 100644 index 00000000..76f1e6f6 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/channel.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/channel.cpython-38.pyc b/tods/detection_algorithm/core/utils/__pycache__/channel.cpython-38.pyc new file mode 100644 index 00000000..346319c5 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/channel.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/errors.cpython-37.pyc b/tods/detection_algorithm/core/utils/__pycache__/errors.cpython-37.pyc new file mode 100644 index 00000000..8c5a6316 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/errors.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/errors.cpython-38.pyc b/tods/detection_algorithm/core/utils/__pycache__/errors.cpython-38.pyc new file mode 100644 index 00000000..bc68d142 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/errors.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/modeling.cpython-37.pyc b/tods/detection_algorithm/core/utils/__pycache__/modeling.cpython-37.pyc new file mode 100644 index 00000000..6cc50df0 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/modeling.cpython-37.pyc differ diff --git a/tods/detection_algorithm/core/utils/__pycache__/modeling.cpython-38.pyc b/tods/detection_algorithm/core/utils/__pycache__/modeling.cpython-38.pyc new file mode 100644 index 00000000..189c7ee5 Binary files /dev/null and b/tods/detection_algorithm/core/utils/__pycache__/modeling.cpython-38.pyc differ diff --git a/tods/detection_algorithm/core/utils/channel.py b/tods/detection_algorithm/core/utils/channel.py new file mode 100644 index 00000000..197bee44 --- /dev/null +++ b/tods/detection_algorithm/core/utils/channel.py @@ -0,0 +1,114 @@ +import numpy as np +import os +import logging + +logger = logging.getLogger('telemanom') + + +class Channel: + def __init__(self,n_predictions,l_s): + # , config, chan_id): + """ + Load and reshape channel values (predicted and actual). + + Args: + config (obj): Config object containing parameters for processing + chan_id (str): channel id + + Attributes: + id (str): channel id + config (obj): see Args + X_train (arr): training inputs with dimensions + [timesteps, l_s, input dimensions) + X_test (arr): test inputs with dimensions + [timesteps, l_s, input dimensions) + y_train (arr): actual channel training values with dimensions + [timesteps, n_predictions, 1) + y_test (arr): actual channel test values with dimensions + [timesteps, n_predictions, 1) + train (arr): train data loaded from .npy file + test(arr): test data loaded from .npy file + """ + + # self.id = chan_id + # self.config = config + self.X_train = None + self.y_train = None + self.X_test = None + self.y_test = None + self.y_hat = None + self.train = None + self.test = None + + self._n_predictions = n_predictions + self._l_s = l_s + + def shape_train_data(self, arr): + # , train=True): + """Shape raw input streams for ingestion into LSTM. config.l_s specifies + the sequence length of prior timesteps fed into the model at + each timestep t. 
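+
+ Shape sketch (informal; N = len(arr) - l_s - n_predictions windows):
+ arr : (timesteps, input_dim)
+ X_train : (N, l_s, input_dim)
+ y_train : (N, n_predictions * input_dim)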
+ + Args: + arr (np array): array of input streams with + dimensions [timesteps, 1, input dimensions] + train (bool): If shaping training data, this indicates + data can be shuffled + """ + # print("in shape data") + # print("arr shape",arr.shape) + # print("ls",self.config.l_s) + # print("n_pred",self.config.n_predictions) + data = [] + + for i in range(len(arr) - self._l_s - self._n_predictions): + data.append(arr[i:i + self._l_s + self._n_predictions]) + data = np.array(data) + # print("data shape",data.shape) + # assert len(data.shape) == 3 + + # if train: + # # np.random.shuffle(data) + # self.X_train = data[:, :-self.config.n_predictions, :] + # self.y_train = data[:, -self.config.n_predictions:, :] # telemetry value is at position 0 + # self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],self.y_train.shape[1]*self.y_train.shape[2])) + # print("X train shape",self.X_train .shape) + # print("Y train shape",self.y_train .shape) + # else: + + self.X_train = data[:, :-self._n_predictions, :] + self.y_train = data[:, -self._n_predictions:, :] # telemetry value is at position 0 + self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],self.y_train.shape[1]*self.y_train.shape[2])) + + + + + def shape_test_data(self, arr): + data = [] + + for i in range(len(arr) - self._l_s - self._n_predictions): + data.append(arr[i:i + self._l_s + self._n_predictions]) + data = np.array(data) + # print("data shape",data.shape) + self.X_test = data[:, :-self._n_predictions, :] + self.y_test = data[:, -self._n_predictions:, :] # telemetry value is at position 0 + self.y_test = np.reshape(self.y_test,(self.y_test.shape[0],self.y_test.shape[1]*self.y_test.shape[2])) + + + # def load_data(self): + # """ + # Load train and test data from local. + # """ + # # try: + # # self.train = np.load(os.path.join("data", "train", "{}.npy".format(self.id))) + # # self.test = np.load(os.path.join("data", "test", "{}.npy".format(self.id))) + + # # except FileNotFoundError as e: + # # # logger.critical(e) + # # # logger.critical("Source data not found, may need to add data to repo: ") + # # print("Source data not found, may need to add data to repo: ") + + # print("before shape function") + # print(self.train.shape) + # self.shape_data(self.train) + # self.shape_data(self.test, train=False) \ No newline at end of file diff --git a/tods/detection_algorithm/core/utils/errors.py b/tods/detection_algorithm/core/utils/errors.py new file mode 100644 index 00000000..d3ee8ab3 --- /dev/null +++ b/tods/detection_algorithm/core/utils/errors.py @@ -0,0 +1,532 @@ +import numpy as np +import pandas as pd +import more_itertools as mit +import os +import logging + +logger = logging.getLogger('telemanom') + + +class Errors: + def __init__(self, channel, window_size,batch_size, smoothing_perc,n_predictions,l_s,error_buffer,p): + """ + Batch processing of errors between actual and predicted values + for a channel. 
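+        Raw errors |y_hat - y_test| are summed over the prediction horizon for
+        each test sample, exponentially smoothed (EWM span derived from
+        batch_size, window_size and smoothing_perc), and later compared against
+        a dynamically selected threshold (epsilon) to flag anomalous sequences.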
+ + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + config (obj): Config object containing parameters for processing + run_id (str): Datetime referencing set of predictions in use + + Attributes: + config (obj): see Args + window_size (int): number of trailing batches to use in error + calculation + n_windows (int): number of windows in test values for channel + i_anom (arr): indices of anomalies in channel test values + E_seq (arr of tuples): array of (start, end) indices for each + continuous anomaly sequence in test values + anom_scores (arr): score indicating relative severity of each + anomaly sequence in E_seq + e (arr): errors in prediction (predicted - actual) + e_s (arr): exponentially-smoothed errors in prediction + normalized (arr): prediction errors as a percentage of the range + of the channel values + """ + + # self.config = config + + + self._window_size =window_size + self._batch_size = batch_size + self._smoothing_perc = smoothing_perc + self._n_predictions = n_predictions + self._l_s = l_s + self._error_buffer = error_buffer + self._p = p + + + self.window_size = self._window_size + self.n_windows = int((channel.y_test.shape[0] - + (self._batch_size * self.window_size)) + / self._batch_size) + self.i_anom = np.array([]) + self.E_seq = [] + self.anom_scores = [] + channel.y_test = np.reshape(channel.y_test,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) + # print("*****************************") + # print("y_hat shape",channel.y_hat.shape) + # print("y_test shape",channel.y_test.shape) + + channel.y_hat = np.reshape(channel.y_hat, (channel.y_hat.shape[0]*channel.y_hat.shape[1]*channel.y_hat.shape[2])) + channel.y_test = np.reshape(channel.y_test, (channel.y_test.shape[0]*channel.y_test.shape[1]*channel.y_test.shape[2])) + + # print("after y_hat shape",channel.y_hat.shape) + # print(" after y_test shape",channel.y_test.shape) + + + # raw prediction error + self.e = [abs(y_h-y_t) for y_h, y_t in + zip(channel.y_hat, channel.y_test)] + + self.e = np.reshape(self.e,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) + # print("raw shape",self.e.shape) + + n_pred = self._n_predictions + n_feature = channel.X_test.shape[2] + + # Aggregation for point wise + # aggregated_error = np.zeros(n_feature*(len(self.e)+n_pred-1)) + # aggregated_error = np.reshape(aggregated_error,((len(self.e)+n_pred-1),n_feature)) + # # print(aggregated_error) + # for i in range(0,len(self.e)): + # for j in range(len(self.e[i])): + # aggregated_error[i+j]+= self.e[i][j] + + # for i in range(1, len(aggregated_error)+1): + # if i < n_pred: + # aggregated_error[i-1] /=i + # elif len(aggregated_error) - i+1 < n_pred: + # aggregated_error[i-1]/= len(aggregated_error) - i+1 + # else: + # aggregated_error[i-1] /=n_pred + + # Aggregation sequence wise + aggregated_error = [] + for i in range(0,len(self.e)): + aggregated_error.append(np.sum(self.e[i],axis=0)) + + aggregated_error = np.asarray(aggregated_error) + # print(aggregated_error.shape) + + smoothing_window = int(self._batch_size * self._window_size + * self._smoothing_perc) + if not len(channel.y_hat) == len(channel.y_test): + raise ValueError('len(y_hat) != len(y_test): {}, {}' + .format(len(channel.y_hat), len(channel.y_test))) + + # smoothed prediction error + self.e_s = pd.DataFrame(aggregated_error).ewm(span=smoothing_window)\ + .mean().values.flatten() + + # print("ES",self.e_s) + # print("ES",self.e_s.shape) + # for values at beginning < sequence 
length, just use avg + # if not channel.id == 'C-2': # anomaly occurs early in window + + # print("LHS",self.e_s[:self.config.l_s]) + # print("RHS",[np.mean(self.e_s[:self.config.l_s * 2])] * self.config.l_s) + # b = [np.mean(self.e_s[:self.config.l_s * 2])] * self.config.l_s + # print("RHS shape",len(b)) + # self.e_s[:self._l_s] = [np.mean(self.e_s[:self._l_s * 2])] * self._l_s + + # np.save(os.path.join('data', run_id, 'smoothed_errors', '{}.npy' + # .format(channel.id)), + # np.array(self.e_s)) + + self.normalized = np.mean(self.e / np.ptp(channel.y_test)) + # logger.info("normalized prediction error: {0:.2f}" + # .format(self.normalized)) + + def adjust_window_size(self, channel): # pragma: no cover + """ + Decrease the historical error window size (h) if number of test + values is limited. + + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + """ + + while self.n_windows < 0: + self.window_size -= 1 + self.n_windows = int((channel.y_test.shape[0] + - (self._batch_size * self.window_size)) + / self._batch_size) + if self.window_size == 1 and self.n_windows < 0: + raise ValueError('Batch_size ({}) larger than y_test (len={}). ' + 'Adjust in config.yaml.' + .format(self._batch_size, + channel.y_test.shape[0])) + + def merge_scores(self): # pragma: no cover + """ + If anomalous sequences from subsequent batches are adjacent they + will automatically be combined. This combines the scores for these + initial adjacent sequences (scores are calculated as each batch is + processed) where applicable. + """ + + merged_scores = [] + score_end_indices = [] + + for i, score in enumerate(self.anom_scores): + if not score['start_idx']-1 in score_end_indices: + merged_scores.append(score['score']) + score_end_indices.append(score['end_idx']) + + def process_batches(self, channel): # pragma: no cover + """ + Top-level function for the Error class that loops through batches + of values for a channel. 
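+        For each window it finds an error threshold for both the raw and the
+        inverted smoothed errors (find_epsilon), flags errors that exceed it
+        (compare_to_epsilon), prunes weak sequences (prune_anoms), and scores
+        the surviving anomalies (score_anomalies), accumulating the results in
+        self.i_anom and self.anom_scores.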
+ + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + """ + + self.adjust_window_size(channel) + + for i in range(0, self.n_windows+1): + prior_idx = i * self._batch_size + idx = (self._window_size * self._batch_size) \ + + (i * self._batch_size) + if i == self.n_windows: + idx = channel.y_test.shape[0] + + window = ErrorWindow(channel, prior_idx, idx, self, i,self._l_s,self._error_buffer,self._batch_size,self._p) + + window.find_epsilon() + window.find_epsilon(inverse=True) + + window.compare_to_epsilon(self) + window.compare_to_epsilon(self, inverse=True) + + if len(window.i_anom) == 0 and len(window.i_anom_inv) == 0: + continue + + window.prune_anoms() + window.prune_anoms(inverse=True) + + if len(window.i_anom) == 0 and len(window.i_anom_inv) == 0: + continue + + window.i_anom = np.sort(np.unique( + np.append(window.i_anom, window.i_anom_inv))).astype('int') + window.score_anomalies(prior_idx) + # print("window anom scores", window.anom_scores) + + # update indices to reflect true indices in full set of values + self.i_anom = np.append(self.i_anom, window.i_anom + prior_idx) + self.anom_scores = self.anom_scores + window.anom_scores + + if len(self.i_anom) > 0: + # group anomalous indices into continuous sequences + groups = [list(group) for group in + mit.consecutive_groups(self.i_anom)] + self.E_seq = [(int(g[0]), int(g[-1])) for g in groups + if not g[0] == g[-1]] + + # additional shift is applied to indices so that they represent the + # position in the original data array, obtained from the .npy files, + # and not the position on y_test (See PR #27). + self.E_seq = [(e_seq[0] + self._l_s, + e_seq[1] + self._l_s) for e_seq in self.E_seq] + + self.merge_scores() + + +class ErrorWindow: # pragma: no cover + def __init__(self, channel,start_idx, end_idx, errors, window_num,l_s,error_buffer,batch_size,p): + """ + Data and calculations for a specific window of prediction errors. + Includes finding thresholds, pruning, and scoring anomalous sequences + for errors and inverted errors (flipped around mean) - significant drops + in values can also be anomalous. 
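+        Inverted errors are computed by reflecting the smoothed errors about
+        their window mean, i.e. e_s_inv = mean_e_s + (mean_e_s - e_s), and are
+        thresholded with their own epsilon_inv.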
+ + Args: + channel (obj): Channel class object containing train/test data + for X,y for a single channel + config (obj): Config object containing parameters for processing + start_idx (int): Starting index for window within full set of + channel test values + end_idx (int): Ending index for window within full set of channel + test values + errors (arr): Errors class object + window_num (int): Current window number within channel test values + + Attributes: + i_anom (arr): indices of anomalies in window + i_anom_inv (arr): indices of anomalies in window of inverted + telemetry values + E_seq (arr of tuples): array of (start, end) indices for each + continuous anomaly sequence in window + E_seq_inv (arr of tuples): array of (start, end) indices for each + continuous anomaly sequence in window of inverted telemetry + values + non_anom_max (float): highest smoothed error value below epsilon + non_anom_max_inv (float): highest smoothed error value below + epsilon_inv + config (obj): see Args + anom_scores (arr): score indicating relative severity of each + anomaly sequence in E_seq within a window + window_num (int): see Args + sd_lim (int): default number of standard deviations to use for + threshold if no winner or too many anomalous ranges when scoring + candidate thresholds + sd_threshold (float): number of standard deviations for calculation + of best anomaly threshold + sd_threshold_inv (float): same as above for inverted channel values + e_s (arr): exponentially-smoothed prediction errors in window + e_s_inv (arr): inverted e_s + sd_e_s (float): standard deviation of e_s + mean_e_s (float): mean of e_s + epsilon (float): threshold for e_s above which an error is + considered anomalous + epsilon_inv (float): threshold for inverted e_s above which an error + is considered anomalous + y_test (arr): Actual telemetry values for window + sd_values (float): st dev of y_test + perc_high (float): the 95th percentile of y_test values + perc_low (float): the 5th percentile of y_test values + inter_range (float): the range between perc_high - perc_low + num_to_ignore (int): number of values to ignore initially when + looking for anomalies + """ + + self._l_s = l_s + self._error_buffer = error_buffer + self._batch_size = batch_size + self._p = p + + + self.i_anom = np.array([]) + self.E_seq = np.array([]) + self.non_anom_max = -1000000 + self.i_anom_inv = np.array([]) + self.E_seq_inv = np.array([]) + self.non_anom_max_inv = -1000000 + + # self.config = config + self.anom_scores = [] + + self.window_num = window_num + + self.sd_lim = 12.0 + self.sd_threshold = self.sd_lim + self.sd_threshold_inv = self.sd_lim + + self.e_s = errors.e_s[start_idx:end_idx] + + self.mean_e_s = np.mean(self.e_s) + self.sd_e_s = np.std(self.e_s) + self.e_s_inv = np.array([self.mean_e_s + (self.mean_e_s - e) + for e in self.e_s]) + + self.epsilon = self.mean_e_s + self.sd_lim * self.sd_e_s + self.epsilon_inv = self.mean_e_s + self.sd_lim * self.sd_e_s + + self.y_test = channel.y_test[start_idx:end_idx] + self.sd_values = np.std(self.y_test) + + self.perc_high, self.perc_low = np.percentile(self.y_test, [95, 5]) + self.inter_range = self.perc_high - self.perc_low + + # ignore initial error values until enough history for processing + self.num_to_ignore = self._l_s * 2 + # if y_test is small, ignore fewer + if len(channel.y_test) < 2500: + self.num_to_ignore = self._l_s + if len(channel.y_test) < 1800: + self.num_to_ignore = 0 + + def find_epsilon(self, inverse=False): + """ + Find the anomaly threshold that maximizes 
function representing + tradeoff between: + a) number of anomalies and anomalous ranges + b) the reduction in mean and st dev if anomalous points are removed + from errors + (see https://arxiv.org/pdf/1802.04431.pdf) + + Args: + inverse (bool): If true, epsilon is calculated for inverted errors + """ + e_s = self.e_s if not inverse else self.e_s_inv + + max_score = -10000000 + + for z in np.arange(2.5, self.sd_lim, 0.5): + epsilon = self.mean_e_s + (self.sd_e_s * z) + + pruned_e_s = e_s[e_s < epsilon] + + i_anom = np.argwhere(e_s >= epsilon).reshape(-1,) + buffer = np.arange(1, self._error_buffer) + i_anom = np.sort(np.concatenate((i_anom, + np.array([i+buffer for i in i_anom]) + .flatten(), + np.array([i-buffer for i in i_anom]) + .flatten()))) + i_anom = i_anom[(i_anom < len(e_s)) & (i_anom >= 0)] + i_anom = np.sort(np.unique(i_anom)) + + if len(i_anom) > 0: + # group anomalous indices into continuous sequences + groups = [list(group) for group + in mit.consecutive_groups(i_anom)] + E_seq = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] + + mean_perc_decrease = (self.mean_e_s - np.mean(pruned_e_s)) \ + / self.mean_e_s + sd_perc_decrease = (self.sd_e_s - np.std(pruned_e_s)) \ + / self.sd_e_s + score = (mean_perc_decrease + sd_perc_decrease) \ + / (len(E_seq) ** 2 + len(i_anom)) + + # sanity checks / guardrails + if score >= max_score and len(E_seq) <= 5 and \ + len(i_anom) < (len(e_s) * 0.5): + max_score = score + if not inverse: + self.sd_threshold = z + self.epsilon = self.mean_e_s + z * self.sd_e_s + else: + self.sd_threshold_inv = z + self.epsilon_inv = self.mean_e_s + z * self.sd_e_s + + def compare_to_epsilon(self, errors_all, inverse=False): + """ + Compare smoothed error values to epsilon (error threshold) and group + consecutive errors together into sequences. + + Args: + errors_all (obj): Errors class object containing list of all + previously identified anomalies in test set + """ + + e_s = self.e_s if not inverse else self.e_s_inv + epsilon = self.epsilon if not inverse else self.epsilon_inv + + # Check: scale of errors compared to values too small? 
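+        # Skip this window when the smoothed errors are negligible relative to
+        # the telemetry: their std does not exceed 5% of the std of the values
+        # and their max does not exceed 5% of the 5th-95th percentile range, or
+        # the max smoothed error is below 0.05 in absolute terms.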
+ if not (self.sd_e_s > (.05 * self.sd_values) or max(self.e_s) + > (.05 * self.inter_range)) or not max(self.e_s) > 0.05: + return + + i_anom = np.argwhere((e_s >= epsilon) & + (e_s > 0.05 * self.inter_range)).reshape(-1,) + + if len(i_anom) == 0: + return + buffer = np.arange(1, self._error_buffer+1) + i_anom = np.sort(np.concatenate((i_anom, + np.array([i + buffer for i in i_anom]) + .flatten(), + np.array([i - buffer for i in i_anom]) + .flatten()))) + i_anom = i_anom[(i_anom < len(e_s)) & (i_anom >= 0)] + + # if it is first window, ignore initial errors (need some history) + if self.window_num == 0: + i_anom = i_anom[i_anom >= self.num_to_ignore] + else: + i_anom = i_anom[i_anom >= len(e_s) - self._batch_size] + + i_anom = np.sort(np.unique(i_anom)) + + # capture max of non-anomalous values below the threshold + # (used in filtering process) + batch_position = self.window_num * self._batch_size + window_indices = np.arange(0, len(e_s)) + batch_position + adj_i_anom = i_anom + batch_position + window_indices = np.setdiff1d(window_indices, + np.append(errors_all.i_anom, adj_i_anom)) + candidate_indices = np.unique(window_indices - batch_position) + non_anom_max = np.max(np.take(e_s, candidate_indices)) + + # group anomalous indices into continuous sequences + groups = [list(group) for group in mit.consecutive_groups(i_anom)] + E_seq = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] + + if inverse: + self.i_anom_inv = i_anom + self.E_seq_inv = E_seq + self.non_anom_max_inv = non_anom_max + else: + self.i_anom = i_anom + self.E_seq = E_seq + self.non_anom_max = non_anom_max + + def prune_anoms(self, inverse=False): + """ + Remove anomalies that don't meet minimum separation from the next + closest anomaly or error value + + Args: + inverse (bool): If true, epsilon is calculated for inverted errors + """ + + E_seq = self.E_seq if not inverse else self.E_seq_inv + e_s = self.e_s if not inverse else self.e_s_inv + non_anom_max = self.non_anom_max if not inverse \ + else self.non_anom_max_inv + + if len(E_seq) == 0: + return + + E_seq_max = np.array([max(e_s[e[0]:e[1]+1]) for e in E_seq]) + E_seq_max_sorted = np.sort(E_seq_max)[::-1] + E_seq_max_sorted = np.append(E_seq_max_sorted, [non_anom_max]) + + i_to_remove = np.array([]) + for i in range(0, len(E_seq_max_sorted)-1): + if (E_seq_max_sorted[i] - E_seq_max_sorted[i+1]) \ + / E_seq_max_sorted[i] < self._p: + i_to_remove = np.append(i_to_remove, np.argwhere( + E_seq_max == E_seq_max_sorted[i])) + else: + i_to_remove = np.array([]) + i_to_remove[::-1].sort() + + if len(i_to_remove) > 0: + E_seq = np.delete(E_seq, i_to_remove, axis=0) + + if len(E_seq) == 0 and inverse: + self.i_anom_inv = np.array([]) + return + elif len(E_seq) == 0 and not inverse: + self.i_anom = np.array([]) + return + + indices_to_keep = np.concatenate([range(e_seq[0], e_seq[-1]+1) + for e_seq in E_seq]) + + if not inverse: + mask = np.isin(self.i_anom, indices_to_keep) + self.i_anom = self.i_anom[mask] + else: + mask_inv = np.isin(self.i_anom_inv, indices_to_keep) + self.i_anom_inv = self.i_anom_inv[mask_inv] + + def score_anomalies(self, prior_idx): + """ + Calculate anomaly scores based on max distance from epsilon + for each anomalous sequence. 
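+        Each sequence is scored as max(|e_s - epsilon|) over its indices,
+        normalized by (mean_e_s + sd_e_s); the larger of the raw-error and
+        inverted-error scores is kept.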
+ + Args: + prior_idx (int): starting index of window within full set of test + values for channel + """ + + groups = [list(group) for group in mit.consecutive_groups(self.i_anom)] + + for e_seq in groups: + + score_dict = { + "start_idx": e_seq[0] + prior_idx, + "end_idx": e_seq[-1] + prior_idx, + "score": 0 + } + + score = max([abs(self.e_s[i] - self.epsilon) + / (self.mean_e_s + self.sd_e_s) for i in + range(e_seq[0], e_seq[-1] + 1)]) + inv_score = max([abs(self.e_s_inv[i] - self.epsilon_inv) + / (self.mean_e_s + self.sd_e_s) for i in + range(e_seq[0], e_seq[-1] + 1)]) + + # the max score indicates whether anomaly was from regular + # or inverted errors + score_dict['score'] = max([score, inv_score]) + self.anom_scores.append(score_dict) \ No newline at end of file diff --git a/tods/detection_algorithm/core/utils/modeling.py b/tods/detection_algorithm/core/utils/modeling.py new file mode 100644 index 00000000..6fb36510 --- /dev/null +++ b/tods/detection_algorithm/core/utils/modeling.py @@ -0,0 +1,206 @@ +from tensorflow.keras.models import Sequential, load_model +from tensorflow.keras.callbacks import History, EarlyStopping, Callback +from tensorflow.keras.layers import LSTM +from tensorflow.keras.layers import Dense, Activation, Dropout +from tensorflow.keras.layers import Flatten +import numpy as np +import os +import logging + +# suppress tensorflow CPU speedup warnings +os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' +logger = logging.getLogger('telemanom') + + +class Model: + def __init__(self, channel,patience,min_delta,layers,dropout,n_predictions,loss_metric, + optimizer,lstm_batch_size,epochs,validation_split,batch_size,l_s + ): + """ + Loads/trains RNN and predicts future telemetry values for a channel. + + Args: + config (obj): Config object containing parameters for processing + and model training + run_id (str): Datetime referencing set of predictions in use + channel (obj): Channel class object containing train/test data + for X,y for a single channel + + Attributes: + config (obj): see Args + chan_id (str): channel id + run_id (str): see Args + y_hat (arr): predicted channel values + model (obj): trained RNN model for predicting channel values + """ + + # self.config = config + # self.chan_id = channel.id + # self.run_id = run_id + self.y_hat = np.array([]) + self.model = None + + # self.save() + + self._patience = patience + self._min_delta = min_delta + self._layers = layers + self._dropout = dropout + self._n_predictions = n_predictions + self._loss_metric = loss_metric + self._optimizer = optimizer + self._lstm_batch_size = lstm_batch_size + self._epochs = epochs + self._validation_split = validation_split + self._batch_size = batch_size + self._l_s = l_s + + self.train_new(channel) + + + # def load(self): + # """ + # Load model for channel. + # """ + + # logger.info('Loading pre-trained model') + # self.model = load_model(os.path.join('data', self.config.use_id, + # 'models', self.chan_id + '.h5')) + + def train_new(self, channel): + """ + Train LSTM model according to specifications in config.yaml. 
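+        The network stacks two LSTM layers, each followed by dropout, and ends
+        with a Dense layer producing n_predictions * n_features linear outputs;
+        training stops early once validation loss stops improving
+        (EarlyStopping with the configured patience and min_delta).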
+
+        Args:
+            channel (obj): Channel class object containing train/test data
+                for X,y for a single channel
+        """
+
+        cbs = [History(), EarlyStopping(monitor='val_loss',
+                                        patience=self._patience,
+                                        min_delta=self._min_delta,
+                                        verbose=1)]
+
+        self.model = Sequential()
+
+        self.model.add(LSTM(
+            self._layers[0],
+            input_shape=(None, channel.X_train.shape[2]),
+            return_sequences=True))
+        self.model.add(Dropout(self._dropout))
+
+        self.model.add(LSTM(
+            self._layers[1],
+            return_sequences=False))
+        self.model.add(Dropout(self._dropout))
+
+        self.model.add(Dense(
+            self._n_predictions * channel.X_train.shape[2]))
+        self.model.add(Activation('linear'))
+
+        self.model.compile(loss=self._loss_metric,
+                           optimizer=self._optimizer)
+
+        # print(self.model.summary())
+
+        self.model.fit(channel.X_train,
+                       channel.y_train,
+                       batch_size=self._lstm_batch_size,
+                       epochs=self._epochs,
+                       shuffle=False,
+                       validation_split=self._validation_split,
+                       callbacks=cbs,
+                       verbose=True)
+
+    # def save(self):
+    #     """
+    #     Save trained model.
+    #     """
+
+    #     self.model.save(os.path.join('data', self.run_id, 'models',
+    #                                  '{}.h5'.format(self.chan_id)))
+
+    def aggregate_predictions(self, y_hat_batch, method='mean'): # pragma: no cover
+        """
+        Aggregates predictions for each timestep. When predicting n steps
+        ahead where n > 1, will end up with multiple predictions for a
+        timestep.
+
+        Args:
+            y_hat_batch (arr): predictions with shape (batch length, n_predictions)
+            method (string): how to aggregate predictions for a timestep -
+                'first' or 'mean'
+        """
+
+        agg_y_hat_batch = np.array([])
+
+        for t in range(len(y_hat_batch)):
+
+            start_idx = t - self._n_predictions
+            start_idx = start_idx if start_idx >= 0 else 0
+
+            # predictions pertaining to a specific timestep lie along diagonal
+            y_hat_t = np.flipud(y_hat_batch[start_idx:t+1]).diagonal()
+
+            if method == 'first':
+                agg_y_hat_batch = np.append(agg_y_hat_batch, [y_hat_t[0]])
+            elif method == 'mean':
+                agg_y_hat_batch = np.append(agg_y_hat_batch, np.mean(y_hat_t))
+
+        agg_y_hat_batch = agg_y_hat_batch.reshape(len(agg_y_hat_batch), 1)
+        self.y_hat = np.append(self.y_hat, agg_y_hat_batch)
+
+    def batch_predict(self, channel):
+        """
+        Use the trained LSTM model to predict test data arriving in batches.
+
+        Args:
+            channel (obj): Channel class object containing train/test data
+                for X,y for a single channel
+
+        Returns:
+            channel (obj): Channel class object with y_hat values as attribute
+        """
+
+        # num_batches = int((y_test.shape[0] - self._l_s)
+        #                   / self._batch_size)
+        # if num_batches < 0:
+        #     raise ValueError('l_s ({}) too large for stream length {}.'
+ # .format(self._l_s, y_test.shape[0])) + + # # simulate data arriving in batches, predict each batch + # for i in range(0, num_batches + 1): + # prior_idx = i * self._batch_size + # idx = (i + 1) * self._batch_size + + # if i + 1 == num_batches + 1: + # # remaining values won't necessarily equal batch size + # idx = y_test.shape[0] + + # X_test_batch = X_test[prior_idx:idx] + # y_hat_batch = self.model.predict(X_test_batch) + # y_hat_batch = np.reshape(y_hat_batch,(X_test.shape[0],self._n_predictions,X_test.shape[2])) + # # print("PREDICTIONS",y_hat_batch.shape) + # self.aggregate_predictions(y_hat_batch) + + # self.y_hat = np.reshape(self.y_hat, (self.y_hat.size,)) + + # channel.y_hat = self.y_hat + + # # np.save(os.path.join('data', self.run_id, 'y_hat', '{}.npy' + # # .format(self.chan_id)), self.y_hat) + + # return channel + + self.y_hat = self.model.predict(channel.X_test) + self.y_hat = np.reshape(self.y_hat,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) + # print("shape before ",self.y_hat.shape) + channel.y_hat = self.y_hat + return channel diff --git a/tods/detection_algorithm/core/utils/utils.py b/tods/detection_algorithm/core/utils/utils.py new file mode 100644 index 00000000..e69de29b