From 7ddd5758b5921d6122307e741d995d85b8b30246 Mon Sep 17 00:00:00 2001 From: LSC2204 Date: Tue, 31 Jan 2023 12:57:40 +0000 Subject: [PATCH] update .gitignore, remove tracked artifacts and unused detection_algorithm core code, and add searcher search spaces and tests --- .gitignore | 3 + .vscode/launch.json | 16 - docs/tods-doc-sphinx | 1 - .../axolotl_interface/run_advanced_search.py | 70 ++- .../axolotl_interface/run_pipeline_system.py | 8 +- out.csv | 64 --- tods-doc-sphinx | 1 - tods/detection_algorithm/core/AutoRegOD.py | 198 ------- .../core/CollectiveBase.py | 476 ---------------- .../core/CollectiveCommonTest.py | 174 ------ tods/detection_algorithm/core/KDiscord.py | 266 --------- tods/detection_algorithm/core/LSTMOD.py | 266 --------- .../core/MultiAutoRegOD.py | 238 -------- tods/detection_algorithm/core/PCA.py | 264 --------- .../detection_algorithm/core/SODCommonTest.py | 154 ----- .../detection_algorithm/core/UODCommonTest.py | 153 ----- tods/detection_algorithm/core/__init__.py | 0 .../core/algorithm_implementation.py | 0 .../core/dagmm/__init__.py | 6 - .../core/dagmm/compression_net.py | 121 ---- tods/detection_algorithm/core/dagmm/dagmm.py | 251 --------- .../core/dagmm/estimation_net.py | 63 --- tods/detection_algorithm/core/dagmm/gmm.py | 130 ----- .../core/test_CollectiveBase.py | 211 ------- tods/detection_algorithm/core/utility.py | 179 ------ .../core/utils/__init__.py | 0 .../detection_algorithm/core/utils/channel.py | 114 ---- tods/detection_algorithm/core/utils/errors.py | 532 ------------------ .../core/utils/modeling.py | 206 ------- tods/detection_algorithm/core/utils/utils.py | 0 tods/searcher/out.csv | 4 - ...detection_algorithm_test_search_space.json | 78 +++ .../search_space/example_search_space.json | 66 +++ .../feature_analysis_test_search_space.json | 17 + tods/searcher/search_space/test.json | 33 ++ ...meseries_processing_test_search_space.json | 88 +++ tods/searcher/searcher.py | 265 --------- tods/tests/searcher/test_pipeline.py | 279 +++++++++ tods/tests/searcher/test_searcher.py | 5 +- 39 files changed, 636 insertions(+), 4364 deletions(-) delete mode 100644 .vscode/launch.json delete mode 160000 docs/tods-doc-sphinx delete mode 100644 out.csv delete mode 160000 tods-doc-sphinx delete mode 100644 tods/detection_algorithm/core/AutoRegOD.py delete mode 100644 tods/detection_algorithm/core/CollectiveBase.py delete mode 100755 tods/detection_algorithm/core/CollectiveCommonTest.py delete mode 100644 tods/detection_algorithm/core/KDiscord.py delete mode 100644 tods/detection_algorithm/core/LSTMOD.py delete mode 100644 tods/detection_algorithm/core/MultiAutoRegOD.py delete mode 100644 tods/detection_algorithm/core/PCA.py delete mode 100755 tods/detection_algorithm/core/SODCommonTest.py delete mode 100755 tods/detection_algorithm/core/UODCommonTest.py delete mode 100644 tods/detection_algorithm/core/__init__.py delete mode 100644 tods/detection_algorithm/core/algorithm_implementation.py delete mode 100644 tods/detection_algorithm/core/dagmm/__init__.py delete mode 100644 tods/detection_algorithm/core/dagmm/compression_net.py delete mode 100644 tods/detection_algorithm/core/dagmm/dagmm.py delete mode 100644 tods/detection_algorithm/core/dagmm/estimation_net.py delete mode 100644 tods/detection_algorithm/core/dagmm/gmm.py delete mode 100644 tods/detection_algorithm/core/test_CollectiveBase.py delete mode 100644 tods/detection_algorithm/core/utility.py delete mode 100644 tods/detection_algorithm/core/utils/__init__.py delete mode 100644 tods/detection_algorithm/core/utils/channel.py delete mode 100644 tods/detection_algorithm/core/utils/errors.py delete mode 100644 
tods/detection_algorithm/core/utils/modeling.py delete mode 100644 tods/detection_algorithm/core/utils/utils.py delete mode 100644 tods/searcher/out.csv create mode 100644 tods/searcher/search_space/detection_algorithm_test_search_space.json create mode 100644 tods/searcher/search_space/example_search_space.json create mode 100644 tods/searcher/search_space/feature_analysis_test_search_space.json create mode 100644 tods/searcher/search_space/test.json create mode 100644 tods/searcher/search_space/timeseries_processing_test_search_space.json diff --git a/.gitignore b/.gitignore index 773adebf..2bf27a43 100644 --- a/.gitignore +++ b/.gitignore @@ -120,3 +120,6 @@ example_pipeline.json tmp.txt benchmark/realworld_data/data/*.csv + +.vscode/ +tods-doc-sphinx/ diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 95c55cf9..00000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python: Current File", - "type": "python", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal", - "justMyCode": true - } - ] -} \ No newline at end of file diff --git a/docs/tods-doc-sphinx b/docs/tods-doc-sphinx deleted file mode 160000 index e6a0d574..00000000 --- a/docs/tods-doc-sphinx +++ /dev/null @@ -1 +0,0 @@ -Subproject commit e6a0d5743f8360966e955859aab70443d9aac178 diff --git a/examples/axolotl_interface/run_advanced_search.py b/examples/axolotl_interface/run_advanced_search.py index 3039d9f6..dc6f1ac7 100644 --- a/examples/axolotl_interface/run_advanced_search.py +++ b/examples/axolotl_interface/run_advanced_search.py @@ -11,7 +11,7 @@ #target_index = 2 # what column is the target table_path = '../../datasets/anomaly/raw_data/yahoo_sub_5.csv' -search_space_path = "../../tods/searcher/example_search_space.json" + target_index = 6 # what column is the target #table_path = 'datasets/NAB/realTweets/labeled_Twitter_volume_IBM.csv' # The path of the dataset time_limit = 30 # How many seconds you wanna search @@ -33,8 +33,72 @@ beta=1.0) # get JSON search space -with open(search_space_path) as f: - search_space= json.load(f) +search_space = { + "timeseries_processing": { + "time_series_seasonality_trend_decomposition": { + "use_semantic_types": [ + 1, + 0 + ] + }, + "moving_average_transform":{ + "window_size":[ + 3, + 4, + 5 + ], + "norm":[ + "l1", + "l2", + "max" + ], + "use_semantic_types":[ + 0, + 1 + ] + } + }, + "feature_analysis": { + "statistical_h_mean": { + "window_size": [ + 10, + 20 + ] + }, + "statistical_maximum": { + "window_size": [ + 10, + 20 + ] + }, + "statistical_minimum": { + "window_size": [ + 10, + 20 + ] + } + }, + "detection_algorithm": { + "pyod_ae": { + "dropout_rate": [ + 0.1, + 0.2 + ] + }, + "pyod_loda": { + "n_bins": [ + 10, + 20 + ] + }, + "pyod_cof": { + "n_neighborss": [ + 15, + 20 + ] + } + } +} #define search process config = { diff --git a/examples/axolotl_interface/run_pipeline_system.py b/examples/axolotl_interface/run_pipeline_system.py index e85d8a6c..6c396106 100644 --- a/examples/axolotl_interface/run_pipeline_system.py +++ b/examples/axolotl_interface/run_pipeline_system.py @@ -44,13 +44,11 @@ 'feature_analysis': [ ('statistical_maximum',), - ], # Specify hyperparams as k,v pairs - - # 'timeseries_processing':[ - # ('standard_scaler',) - # ], + ] } pipeline = build_system_pipeline(config) +print(pipeline.to_json()) +input() # Run 
the pipeline pipeline_result = evaluate_pipeline(dataset, pipeline, metric) diff --git a/out.csv b/out.csv deleted file mode 100644 index c84b2594..00000000 --- a/out.csv +++ /dev/null @@ -1,64 +0,0 @@ -,F_beta,RECALL,PRECISION,F1,F1_MACRO,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,time_total_s,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,trial_id,experiment_tag,config/detection_algorithm,config/feature_analysis,config/timeseries_processing,logdir -0,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.637336492538452,False,,,1,97547e98882f4bb7bff02cdc0c39a883,2022-12-10_13-39-13,1670679553,5.637336492538452,35978,wLLGv5,172.17.0.6,5.637336492538452,0,1,05589_00000,"0_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],[['statistical_h_mean']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00000_0_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time__2022-12-10_13-39-07" -1,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5033113956451416,False,,,1,01ac6dd12a864006acf5485e1a2189cc,2022-12-10_13-39-14,1670679554,0.5033113956451416,35978,wLLGv5,172.17.0.6,0.5033113956451416,0,1,05589_00001,"1_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],[['statistical_h_mean']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00001_1_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['tim_2022-12-10_13-39-13" -2,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4011819362640381,False,,,1,11ff661c769144ab921d26b1c39537dc,2022-12-10_13-39-14,1670679554,0.4011819362640381,35978,wLLGv5,172.17.0.6,0.4011819362640381,0,1,05589_00002,"2_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],[['statistical_h_mean']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00002_2_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_2022-12-10_13-39-14" -3,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.454044342041016,False,,,1,b036fcc03b6b4091acf7778bbd36515f,2022-12-10_13-39-20,1670679560,5.454044342041016,35978,wLLGv5,172.17.0.6,5.454044342041016,0,1,05589_00003,"3_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_maximum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00003_3_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],times_2022-12-10_13-39-14" 
-4,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.6077921390533447,False,,,1,37c17031762e485383f26b3d7dbe2d79,2022-12-10_13-39-20,1670679560,0.6077921390533447,35978,wLLGv5,172.17.0.6,0.6077921390533447,0,1,05589_00004,"4_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_maximum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00004_4_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],tim_2022-12-10_13-39-20" -5,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4982719421386719,False,,,1,edf2c722cb724212b447664cf628a590,2022-12-10_13-39-21,1670679561,0.4982719421386719,35978,wLLGv5,172.17.0.6,0.4982719421386719,0,1,05589_00005,"5_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_maximum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00005_5_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],time_2022-12-10_13-39-20" -6,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.534754753112793,False,,,1,0c6087b45bd34012a925c1e10920939d,2022-12-10_13-39-26,1670679566,5.534754753112793,35978,wLLGv5,172.17.0.6,5.534754753112793,0,1,05589_00006,"6_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00006_6_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['sta_2022-12-10_13-39-21" -7,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,1.3592181205749512,False,,,1,f5ac927c7e364755983b430df1d78294,2022-12-10_13-39-28,1670679568,1.3592181205749512,35978,wLLGv5,172.17.0.6,1.3592181205749512,0,1,05589_00007,"7_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00007_7_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['s_2022-12-10_13-39-26" -8,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.6394298076629639,False,,,1,ec678bacdff444c0ba4395d65fdd4b48,2022-12-10_13-39-28,1670679568,0.6394298076629639,35978,wLLGv5,172.17.0.6,0.6394298076629639,0,1,05589_00008,"8_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_maximum'], 
['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00008_8_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['st_2022-12-10_13-39-28" -9,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.555160045623779,False,,,1,787402d8505c4d048f1e3d0e909fb933,2022-12-10_13-39-34,1670679574,5.555160045623779,35978,wLLGv5,172.17.0.6,5.555160045623779,0,1,05589_00009,"9_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00009_9_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],times_2022-12-10_13-39-28" -10,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5893311500549316,False,,,1,8137b5b3b4bc4601a2847a028eed2e2e,2022-12-10_13-39-35,1670679575,0.5893311500549316,35978,wLLGv5,172.17.0.6,0.5893311500549316,0,1,05589_00010,"10_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00010_10_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],ti_2022-12-10_13-39-34" -11,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5052144527435303,False,,,1,a04b7df2c4e7415586cc4d418556ba2e,2022-12-10_13-39-35,1670679575,0.5052144527435303,35978,wLLGv5,172.17.0.6,0.5052144527435303,0,1,05589_00011,"11_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00011_11_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],tim_2022-12-10_13-39-35" -12,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.2729315757751465,False,,,1,49febb8c29d647f7be4ceeca278c5a22,2022-12-10_13-39-41,1670679581,5.2729315757751465,35978,wLLGv5,172.17.0.6,5.2729315757751465,0,1,05589_00012,"12_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],[['statistical_maximum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00012_12_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['tim_2022-12-10_13-39-35" 
-13,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4817383289337158,False,,,1,6cd2c1cd413a4dbcb7b94b2fc4cdaaeb,2022-12-10_13-39-41,1670679581,0.4817383289337158,35978,wLLGv5,172.17.0.6,0.4817383289337158,0,1,05589_00013,"13_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],[['statistical_maximum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00013_13_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['t_2022-12-10_13-39-41" -14,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.556443452835083,False,,,1,73ba12f6ce794e42b1a4221db38d2008,2022-12-10_13-39-42,1670679582,0.556443452835083,35978,wLLGv5,172.17.0.6,0.556443452835083,0,1,05589_00014,"14_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],[['statistical_maximum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00014_14_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['ti_2022-12-10_13-39-41" -15,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.905134201049805,False,,,1,361b67e538324c109af0e8c196db36fb,2022-12-10_13-39-48,1670679588,5.905134201049805,35978,wLLGv5,172.17.0.6,5.905134201049805,0,1,05589_00015,"15_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],"[['statistical_maximum'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00015_15_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],tim_2022-12-10_13-39-42" -16,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5968108177185059,False,,,1,154b56585f72430eba5a6ee5337cce01,2022-12-10_13-39-48,1670679588,0.5968108177185059,35978,wLLGv5,172.17.0.6,0.5968108177185059,0,1,05589_00016,"16_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],"[['statistical_maximum'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00016_16_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],t_2022-12-10_13-39-48" -17,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4951808452606201,False,,,1,6b138abdd21243538db03a76b43e907b,2022-12-10_13-39-49,1670679589,0.4951808452606201,35978,wLLGv5,172.17.0.6,0.4951808452606201,0,1,05589_00017,"17_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],"[['statistical_maximum'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00017_17_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum'], 
['statistical_minimum']],ti_2022-12-10_13-39-48" -18,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.267411947250366,False,,,1,8d7c3106a72e4380ada18e1046c7776e,2022-12-10_13-39-54,1670679594,5.267411947250366,35978,wLLGv5,172.17.0.6,5.267411947250366,0,1,05589_00018,"18_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_ae']],[['statistical_minimum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00018_18_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['tim_2022-12-10_13-39-49" -19,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5053033828735352,False,,,1,a20e7259243f42da943c5d596701217e,2022-12-10_13-39-55,1670679595,0.5053033828735352,35978,wLLGv5,172.17.0.6,0.5053033828735352,0,1,05589_00019,"19_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],[['statistical_minimum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00019_19_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['t_2022-12-10_13-39-54" -20,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4227888584136963,False,,,1,d0d7b9c9c7c8450ab69cf16a87c7b7d2,2022-12-10_13-39-55,1670679595,0.4227888584136963,35978,wLLGv5,172.17.0.6,0.4227888584136963,0,1,05589_00020,"20_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_cof']],[['statistical_minimum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00020_20_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['ti_2022-12-10_13-39-55" -21,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.713510036468506,False,,,1,ccd3d3be1f2843fb984ac4bb9cd6154d,2022-12-10_13-40-01,1670679601,5.713510036468506,35978,wLLGv5,172.17.0.6,5.713510036468506,0,1,05589_00021,"21_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],[['statistical_h_mean']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00021_21_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_2022-12-10_13-39-55" -22,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5926563739776611,False,,,1,598f3d35fd044958aac63638010f3684,2022-12-10_13-40-02,1670679602,0.5926563739776611,35978,wLLGv5,172.17.0.6,0.5926563739776611,0,1,05589_00022,"22_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],[['statistical_h_mean']],"[['time_series_seasonality_trend_decomposition'], 
['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00022_22_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['ti_2022-12-10_13-40-01" -23,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5090138912200928,False,,,1,4a679ef8c1ae4014b07c37e7cf1f7d87,2022-12-10_13-40-02,1670679602,0.5090138912200928,35978,wLLGv5,172.17.0.6,0.5090138912200928,0,1,05589_00023,"23_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_cof']],[['statistical_h_mean']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00023_23_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['tim_2022-12-10_13-40-02" -24,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.484833240509033,False,,,1,1c10b469b1b940498e89eb27a30a4591,2022-12-10_13-40-08,1670679608,5.484833240509033,35978,wLLGv5,172.17.0.6,5.484833240509033,0,1,05589_00024,"24_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_maximum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00024_24_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],time_2022-12-10_13-40-02" -25,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.7122187614440918,False,,,1,11962312975446609461409297b9dbc1,2022-12-10_13-40-09,1670679609,0.7122187614440918,35978,wLLGv5,172.17.0.6,0.7122187614440918,0,1,05589_00025,"25_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_maximum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00025_25_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],ti_2022-12-10_13-40-08" -26,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.6470978260040283,False,,,1,39939f582b2942d1b2e08bff71904b09,2022-12-10_13-40-09,1670679609,0.6470978260040283,35978,wLLGv5,172.17.0.6,0.6470978260040283,0,1,05589_00026,"26_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_maximum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00026_26_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],tim_2022-12-10_13-40-09" 
-27,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.962493658065796,False,,,1,ec0879af9b8447bba3f1010c806332b7,2022-12-10_13-40-15,1670679615,5.962493658065796,35978,wLLGv5,172.17.0.6,5.962493658065796,0,1,05589_00027,"27_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00027_27_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['st_2022-12-10_13-40-09" -28,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,1.0065770149230957,False,,,1,5d43c73e0e35438dbaa64aadbfa1f961,2022-12-10_13-40-17,1670679617,1.0065770149230957,35978,wLLGv5,172.17.0.6,1.0065770149230957,0,1,05589_00028,"28_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00028_28_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['_2022-12-10_13-40-15" -29,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,1.3891470432281494,False,,,1,90e9210e23e5477bb4424cb2032b637a,2022-12-10_13-40-18,1670679618,1.3891470432281494,35978,wLLGv5,172.17.0.6,1.3891470432281494,0,1,05589_00029,"29_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00029_29_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['s_2022-12-10_13-40-17" -30,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.6175572872161865,False,,,1,433ee7d584a649e2bd2f54ed10dbd993,2022-12-10_13-40-24,1670679624,5.6175572872161865,35978,wLLGv5,172.17.0.6,5.6175572872161865,0,1,05589_00030,"30_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00030_30_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],time_2022-12-10_13-40-18" 
-31,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.6850948333740234,False,,,1,5929e8f6b7b742808d51c949f954ce66,2022-12-10_13-40-24,1670679624,0.6850948333740234,35978,wLLGv5,172.17.0.6,0.6850948333740234,0,1,05589_00031,"31_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00031_31_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],ti_2022-12-10_13-40-24" -32,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5883927345275879,False,,,1,f459b5c2e9f44a46923fe4b4c74a111e,2022-12-10_13-40-25,1670679625,0.5883927345275879,35978,wLLGv5,172.17.0.6,0.5883927345275879,0,1,05589_00032,"32_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00032_32_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],tim_2022-12-10_13-40-24" -33,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.192371845245361,False,,,1,1a677006f0df472199c6bddb7c09d126,2022-12-10_13-40-30,1670679630,5.192371845245361,35978,wLLGv5,172.17.0.6,5.192371845245361,0,1,05589_00033,"33_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],[['statistical_maximum']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00033_33_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['tim_2022-12-10_13-40-25" -34,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.582158088684082,False,,,1,f292b51adb4b46d6af431685c469aa76,2022-12-10_13-40-31,1670679631,0.582158088684082,35978,wLLGv5,172.17.0.6,0.582158088684082,0,1,05589_00034,"34_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],[['statistical_maximum']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00034_34_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['t_2022-12-10_13-40-30" -35,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5006301403045654,False,,,1,959e0810a957416b9cbf4fcd9d060b6d,2022-12-10_13-40-32,1670679632,0.5006301403045654,35978,wLLGv5,172.17.0.6,0.5006301403045654,0,1,05589_00035,"35_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], 
['moving_average_transform']]",[['pyod_cof']],[['statistical_maximum']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00035_35_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['ti_2022-12-10_13-40-31" -36,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.738236904144287,False,,,1,f4084891e3284150a70a8d90a770c54b,2022-12-10_13-40-37,1670679637,5.738236904144287,35978,wLLGv5,172.17.0.6,5.738236904144287,0,1,05589_00036,"36_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],"[['statistical_maximum'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00036_36_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],tim_2022-12-10_13-40-32" -37,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.7746410369873047,False,,,1,2b9e241afac24058984c9d9cb4f4aa4b,2022-12-10_13-40-38,1670679638,0.7746410369873047,35978,wLLGv5,172.17.0.6,0.7746410369873047,0,1,05589_00037,"37_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],"[['statistical_maximum'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00037_37_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],t_2022-12-10_13-40-37" -38,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5962550640106201,False,,,1,2776d8f80d204dafb99e46fafad4753d,2022-12-10_13-40-39,1670679639,0.5962550640106201,35978,wLLGv5,172.17.0.6,0.5962550640106201,0,1,05589_00038,"38_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_cof']],"[['statistical_maximum'], ['statistical_minimum']]","[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00038_38_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],ti_2022-12-10_13-40-38" -39,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.42234206199646,False,,,1,4d3c5b13801a45b4987f8e2781de9b7c,2022-12-10_13-40-44,1670679644,5.42234206199646,35978,wLLGv5,172.17.0.6,5.42234206199646,0,1,05589_00039,"39_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_ae']],[['statistical_minimum']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00039_39_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['tim_2022-12-10_13-40-39" 
-40,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.615044116973877,False,,,1,9ad6238e757947ed9ec48458fe788563,2022-12-10_13-40-45,1670679645,0.615044116973877,35978,wLLGv5,172.17.0.6,0.615044116973877,0,1,05589_00040,"40_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_loda']],[['statistical_minimum']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00040_40_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['t_2022-12-10_13-40-45" -41,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5040719509124756,False,,,1,7f6ca27186f742e4ba0418f4ed5a1175,2022-12-10_13-40-46,1670679646,0.5040719509124756,35978,wLLGv5,172.17.0.6,0.5040719509124756,0,1,05589_00041,"41_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]",[['pyod_cof']],[['statistical_minimum']],"[['time_series_seasonality_trend_decomposition'], ['moving_average_transform']]","/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00041_41_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['ti_2022-12-10_13-40-45" -42,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.575361251831055,False,,,1,d377f09059674112b5e83c335897f87e,2022-12-10_13-40-51,1670679651,5.575361251831055,35978,wLLGv5,172.17.0.6,5.575361251831055,0,1,05589_00042,"42_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],[['statistical_h_mean']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00042_42_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['movi_2022-12-10_13-40-46" -43,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4793574810028076,False,,,1,1274e6c7ab99430e950c71420230c369,2022-12-10_13-40-52,1670679652,0.4793574810028076,35978,wLLGv5,172.17.0.6,0.4793574810028076,0,1,05589_00043,"43_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],[['statistical_h_mean']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00043_43_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['mo_2022-12-10_13-40-51" -44,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.3718013763427734,False,,,1,7a2d2547e7804e52a2e66498e8d573ff,2022-12-10_13-40-52,1670679652,0.3718013763427734,35978,wLLGv5,172.17.0.6,0.3718013763427734,0,1,05589_00044,"44_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],[['statistical_h_mean']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00044_44_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean']],timeseries_processing=[['mov_2022-12-10_13-40-52" 
-45,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.289066791534424,False,,,1,9b9a899fcb5a401c907322d06607c3b1,2022-12-10_13-40-58,1670679658,5.289066791534424,35978,wLLGv5,172.17.0.6,5.289066791534424,0,1,05589_00045,"45_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_maximum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00045_45_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],time_2022-12-10_13-40-52" -46,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5115981101989746,False,,,1,f76d5b8e767e4fcbb22a266168e09246,2022-12-10_13-40-58,1670679658,0.5115981101989746,35978,wLLGv5,172.17.0.6,0.5115981101989746,0,1,05589_00046,"46_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_maximum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00046_46_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],ti_2022-12-10_13-40-58" -47,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4410786628723144,False,,,1,a58eb4d4009e44af8ab6918bd39c572d,2022-12-10_13-40-59,1670679659,0.4410786628723144,35978,wLLGv5,172.17.0.6,0.4410786628723144,0,1,05589_00047,"47_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_maximum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00047_47_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum']],tim_2022-12-10_13-40-58" -48,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.416203737258911,False,,,1,7e3d1318be3e433fab36fad2be812e3f,2022-12-10_13-41-04,1670679664,5.416203737258911,35978,wLLGv5,172.17.0.6,5.416203737258911,0,1,05589_00048,"48_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00048_48_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['st_2022-12-10_13-40-59" -49,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,1.1001946926116943,False,,,1,c9b918a8ac4643289f62bf9cc058263f,2022-12-10_13-41-05,1670679665,1.1001946926116943,35978,wLLGv5,172.17.0.6,1.1001946926116943,0,1,05589_00049,"49_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00049_49_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], 
['statistical_maximum'], ['_2022-12-10_13-41-04" -50,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5794816017150879,False,,,1,8ea2e8a1da184a4cbeda2d333acc0b9f,2022-12-10_13-41-06,1670679666,0.5794816017150879,35978,wLLGv5,172.17.0.6,0.5794816017150879,0,1,05589_00050,"50_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_maximum'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00050_50_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_maximum'], ['s_2022-12-10_13-41-05" -51,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.390760660171509,False,,,1,419bf9b1a73e43369c7d319dab616507,2022-12-10_13-41-11,1670679671,5.390760660171509,35978,wLLGv5,172.17.0.6,5.390760660171509,0,1,05589_00051,"51_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],"[['statistical_h_mean'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00051_51_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],time_2022-12-10_13-41-06" -52,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5620133876800537,False,,,1,b6b0b90b50954dd5a74e2fadc8d5ee66,2022-12-10_13-41-12,1670679672,0.5620133876800537,35978,wLLGv5,172.17.0.6,0.5620133876800537,0,1,05589_00052,"52_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],"[['statistical_h_mean'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00052_52_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],ti_2022-12-10_13-41-12" -53,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4495429992675781,False,,,1,79432b9f65324bc398c41e5caf9be2e5,2022-12-10_13-41-13,1670679673,0.4495429992675781,35978,wLLGv5,172.17.0.6,0.4495429992675781,0,1,05589_00053,"53_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],"[['statistical_h_mean'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00053_53_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_h_mean'], ['statistical_minimum']],tim_2022-12-10_13-41-12" -54,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.084603548049927,False,,,1,822f7fba685240b4ab07b74452cac036,2022-12-10_13-41-18,1670679678,5.084603548049927,35978,wLLGv5,172.17.0.6,5.084603548049927,0,1,05589_00054,"54_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],[['statistical_maximum']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00054_54_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['mov_2022-12-10_13-41-13" 
-55,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4793636798858642,False,,,1,5b39fd548a7a4106a27d084f19043e07,2022-12-10_13-41-18,1670679678,0.4793636798858642,35978,wLLGv5,172.17.0.6,0.4793636798858642,0,1,05589_00055,"55_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],[['statistical_maximum']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00055_55_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['m_2022-12-10_13-41-18" -56,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.3846521377563476,False,,,1,4b3132cf24e74f78bad06f5a154732ae,2022-12-10_13-41-19,1670679679,0.3846521377563476,35978,wLLGv5,172.17.0.6,0.3846521377563476,0,1,05589_00056,"56_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],[['statistical_maximum']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00056_56_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['mo_2022-12-10_13-41-18" -57,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.610422849655151,False,,,1,8f7917b4628a42d1b30c1b557eb503a1,2022-12-10_13-41-25,1670679685,5.610422849655151,35978,wLLGv5,172.17.0.6,5.610422849655151,0,1,05589_00057,"57_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],"[['statistical_maximum'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00057_57_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],tim_2022-12-10_13-41-19" -58,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.5524132251739502,False,,,1,6471e569bf8c4e54ad89a51e04749a6a,2022-12-10_13-41-25,1670679685,0.5524132251739502,35978,wLLGv5,172.17.0.6,0.5524132251739502,0,1,05589_00058,"58_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],"[['statistical_maximum'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00058_58_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],t_2022-12-10_13-41-25" -59,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4558494091033935,False,,,1,fe4008aedfda416192efc1188138d08b,2022-12-10_13-41-26,1670679686,0.4558494091033935,35978,wLLGv5,172.17.0.6,0.4558494091033935,0,1,05589_00059,"59_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],"[['statistical_maximum'], ['statistical_minimum']]",[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00059_59_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],ti_2022-12-10_13-41-25" 
-60,0.4864864864864865,0.0,0.0,0.9473684210526316,0.4864864864864865,5.233817100524902,False,,,1,74038b67e4414a29aed48291a83035b1,2022-12-10_13-41-31,1670679691,5.233817100524902,35978,wLLGv5,172.17.0.6,5.233817100524902,0,1,05589_00060,"60_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_ae']],[['statistical_minimum']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00060_60_detection_algorithm=[['pyod_ae']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['mov_2022-12-10_13-41-26" -61,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.4943070411682129,False,,,1,1592ec6286c5410ca52d07edced6653a,2022-12-10_13-41-32,1670679692,0.4943070411682129,35978,wLLGv5,172.17.0.6,0.4943070411682129,0,1,05589_00061,"61_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_loda']],[['statistical_minimum']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00061_61_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['m_2022-12-10_13-41-31" -62,0.4722222222222222,0.0,0.0,0.8947368421052632,0.4722222222222222,0.3827309608459472,False,,,1,137581c0aa154ee88504d189bf323049,2022-12-10_13-41-32,1670679692,0.3827309608459472,35978,wLLGv5,172.17.0.6,0.3827309608459472,0,1,05589_00062,"62_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['moving_average_transform']]",[['pyod_cof']],[['statistical_minimum']],[['moving_average_transform']],"/root/ray_results/_evaluate_2022-12-10_13-39-07/_evaluate_05589_00062_62_detection_algorithm=[['pyod_cof']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['mo_2022-12-10_13-41-32" diff --git a/tods-doc-sphinx b/tods-doc-sphinx deleted file mode 160000 index 798ed444..00000000 --- a/tods-doc-sphinx +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 798ed444ce3c3912b66445ee8efeb73e6af6e5ff diff --git a/tods/detection_algorithm/core/AutoRegOD.py b/tods/detection_algorithm/core/AutoRegOD.py deleted file mode 100644 index a2286dca..00000000 --- a/tods/detection_algorithm/core/AutoRegOD.py +++ /dev/null @@ -1,198 +0,0 @@ -# -*- coding: utf-8 -*- -"""Autoregressive model for univariate time series outlier detection. -""" -import numpy as np -from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted -from sklearn.linear_model import LinearRegression - -from .CollectiveBase import CollectiveBaseDetector - -from .utility import get_sub_matrices - - -class AutoRegOD(CollectiveBaseDetector): - """Autoregressive models use linear regression to calculate a sample's - deviance from the predicted value, which is then used as its - outlier scores. This model is for univariate time series. - See MultiAutoRegOD for multivariate data. - - See :cite:`aggarwal2015outlier` Chapter 9 for details. - - Parameters - ---------- - window_size : int - The moving window size. - - step_size : int, optional (default=1) - The displacement for moving window. - - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, i.e. - the proportion of outliers in the data set. When fitting this is used - to define the threshold on the decision function. 
- - Attributes - ---------- - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. - The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is fitted. - - threshold_ : float - The threshold is based on ``contamination``. It is the - ``n_samples * contamination`` most abnormal samples in - ``decision_scores_``. The threshold is calculated for generating - binary outlier labels. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. - """ - - def __init__(self, window_size, step_size=1, contamination=0.1): - super(AutoRegOD, self).__init__(contamination=contamination) - self.window_size = window_size - self.step_size = step_size - - def fit(self, X: np.array) -> object: - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. - """ - X = check_array(X).astype(np.float) - - # generate X and y - sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( - X, - window_size=self.window_size, - step=self.step_size, - return_numpy=True, - flatten=True) - # remove the last one - sub_matrices = sub_matrices[:-1, :] - self.left_inds_ = self.left_inds_[:-1] - self.right_inds_ = self.right_inds_[:-1] - - self.valid_len_ = sub_matrices.shape[0] - - y_buf = np.zeros([self.valid_len_, 1]) - - for i in range(self.valid_len_): - y_buf[i] = X[i * self.step_size + self.window_size] - # print(sub_matrices.shape, y_buf.shape) - - # fit the linear regression model - self.lr_ = LinearRegression(fit_intercept=True) - self.lr_.fit(sub_matrices, y_buf) - self.decision_scores_ = np.absolute( - y_buf.ravel() - self.lr_.predict(sub_matrices).ravel()) - - self._process_decision_scores() - return self - - def predict(self, X): # pragma: no cover - """Predict if a particular sample is an outlier or not. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - Returns - ------- - outlier_labels : numpy array of shape (n_samples,) - For each observation, tells whether or not - it should be considered as an outlier according to the - fitted model. 0 stands for inliers and 1 for outliers. - """ - - check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) - - pred_score, X_left_inds, X_right_inds = self.decision_function(X) - - pred_score = np.concatenate((np.zeros((self.window_size,)), pred_score)) - X_left_inds = np.concatenate((np.zeros((self.window_size,)), X_left_inds)) - X_right_inds = np.concatenate((np.zeros((self.window_size,)), X_right_inds)) - - return (pred_score > self.threshold_).astype( - 'int').ravel(), X_left_inds.ravel(), X_right_inds.ravel() - - def decision_function(self, X: np.array): - """Predict raw anomaly scores of X using the fitted detector. - - The anomaly score of an input sample is computed based on the fitted - detector. For consistency, outliers are assigned with - higher anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. Sparse matrices are accepted only - if they are supported by the base estimator. 
- - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. - """ - check_is_fitted(self, ['lr_']) - - sub_matrices, X_left_inds, X_right_inds = \ - get_sub_matrices(X, - window_size=self.window_size, - step=self.step_size, - return_numpy=True, - flatten=True) - - # remove the last one - sub_matrices = sub_matrices[:-1, :] - X_left_inds = X_left_inds[:-1] - X_right_inds = X_right_inds[:-1] - - valid_len = sub_matrices.shape[0] - - y_buf = np.zeros([valid_len, 1]) - - for i in range(valid_len): - y_buf[i] = X[i * self.step_size + self.window_size] - - pred_score = np.absolute( - y_buf.ravel() - self.lr_.predict(sub_matrices).ravel()) - - return pred_score, X_left_inds.ravel(), X_right_inds.ravel() - - -if __name__ == "__main__": # pragma: no cover - X_train = np.asarray( - [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, - 100]).reshape(-1, 1) - - X_test = np.asarray( - [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, - 1) - - clf = AutoRegOD(window_size=3, contamination=0.2) - clf.fit(X_train) - decision_scores, left_inds_, right_inds = clf.decision_scores_, \ - clf.left_inds_, clf.right_inds_ - print(clf.left_inds_, clf.right_inds_) - pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) - pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) - pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) - - print(pred_scores) - print(pred_labels) - print(pred_probs) diff --git a/tods/detection_algorithm/core/CollectiveBase.py b/tods/detection_algorithm/core/CollectiveBase.py deleted file mode 100644 index 67207bfe..00000000 --- a/tods/detection_algorithm/core/CollectiveBase.py +++ /dev/null @@ -1,476 +0,0 @@ -# -*- coding: utf-8 -*- -"""Base class for all Collective outlier detector models -""" - -from __future__ import division -from __future__ import print_function - -import warnings -from collections import defaultdict - -from inspect import signature - -import abc -from abc import ABCMeta - -import numpy as np -from numpy import percentile -from scipy.special import erf -from sklearn.preprocessing import MinMaxScaler -from sklearn.utils import deprecated -from sklearn.utils.validation import check_is_fitted -from sklearn.utils.multiclass import check_classification_targets - - -def _pprint(params, offset=0, printer=repr): # pragma: no cover - # noinspection PyPep8 - """Pretty print the dictionary 'params' - - See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html - and sklearn/base.py for more information. - - :param params: The dictionary to pretty print - :type params: dict - - :param offset: The offset in characters to add at the begin of each line. - :type offset: int - - :param printer: The function to convert entries to strings, typically - the builtin str or repr - :type printer: callable - - :return: None - """ - - # Do a multi-line justified repr: - options = np.get_printoptions() - np.set_printoptions(precision=5, threshold=64, edgeitems=2) - params_list = list() - this_line_length = offset - line_sep = ',\n' + (1 + offset // 2) * ' ' - for i, (k, v) in enumerate(sorted(params.items())): - if type(v) is float: - # use str for representing floating point numbers - # this way we get consistent representation across - # architectures and versions. - this_repr = '%s=%s' % (k, str(v)) - else: - # use repr of the rest - this_repr = '%s=%s' % (k, printer(v)) - if len(this_repr) > 500: - this_repr = this_repr[:300] + '...' 
+ this_repr[-100:] - if i > 0: - if this_line_length + len(this_repr) >= 75 or '\n' in this_repr: - params_list.append(line_sep) - this_line_length = len(line_sep) - else: - params_list.append(', ') - this_line_length += 2 - params_list.append(this_repr) - this_line_length += len(this_repr) - - np.set_printoptions(**options) - lines = ''.join(params_list) - # Strip trailing space to avoid nightmare in doctests - lines = '\n'.join(l.rstrip(' ') for l in lines.split('\n')) - return lines - - -class CollectiveBaseDetector(metaclass=ABCMeta): - """Abstract class for all outlier detection algorithms. - - Parameters - ---------- - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, - i.e. the proportion of outliers in the data set. Used when fitting to - define the threshold on the decision function. - - window_size : int, optional (default=1) - The moving window size. - - step_size :, optional (default=1) - The displacement for moving window. - - Attributes - ---------- - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. - The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is fitted. - - threshold_ : float - The threshold is based on ``contamination``. It is the - ``n_samples * contamination`` most abnormal samples in - ``decision_scores_``. The threshold is calculated for generating - binary outlier labels. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. - """ - - @abc.abstractmethod - def __init__(self, contamination=0.1, - window_size=1, - step_size=1): # pragma: no cover - - if not (0. < contamination <= 0.5): - raise ValueError("contamination must be in (0, 0.5], " - "got: %f" % contamination) - - self.contamination = contamination - self.window_size = window_size - self.step_size = step_size - self._classes = 2 # leave the parameter on for extension - self.left_inds_ = None - self.right_inds = None - - # noinspection PyIncorrectDocstring - @abc.abstractmethod - def fit(self, X, y=None): # pragma: no cover - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. - """ - pass - - @abc.abstractmethod - def decision_function(self, X): # pragma: no cover - """Predict raw anomaly scores of X using the fitted detector. - - The anomaly score of an input sample is computed based on the fitted - detector. For consistency, outliers are assigned with - higher anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. - """ - pass - - @deprecated() - def fit_predict(self, X, y=None): # pragma: no cover - """Fit detector first and then predict whether a particular sample - is an outlier or not. y is ignored in unsupervised models. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. 
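Concrete detectors fill in this contract: the constructor validates contamination, fit() must set decision_scores_, left_inds_ and right_inds_ and call _process_decision_scores(), and decision_function() returns scores plus the window index arrays. A hedged sketch of a minimal subclass follows; the class name and its distance-to-mean scoring rule are invented for illustration and are not a TODS detector.

import numpy as np

class MeanDeviationDetector(CollectiveBaseDetector):
    # Hypothetical subclass: scores each sample by its distance to the
    # training mean, with trivial one-sample windows.
    def __init__(self, contamination=0.1):
        super().__init__(contamination=contamination)

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        self._mean = X.mean(axis=0)
        self.decision_scores_ = np.linalg.norm(X - self._mean, axis=1)
        self.left_inds_ = np.arange(len(X))
        self.right_inds_ = self.left_inds_ + 1
        self._process_decision_scores()  # sets threshold_ and labels_
        return self

    def decision_function(self, X):
        X = np.asarray(X, dtype=float)
        scores = np.linalg.norm(X - self._mean, axis=1)
        inds = np.arange(len(X))
        return scores, inds, inds + 1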
- - Returns - ------- - outlier_labels : numpy array of shape (n_samples,) - For each observation, tells whether or not - it should be considered as an outlier according to the - fitted model. 0 stands for inliers and 1 for outliers. - - .. deprecated:: 0.6.9 - `fit_predict` will be removed in pyod 0.8.0.; it will be - replaced by calling `fit` function first and then accessing - `labels_` attribute for consistency. - """ - - self.fit(X, y) - return self.labels_ - - def predict(self, X): # pragma: no cover - """Predict if a particular sample is an outlier or not. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - Returns - ------- - outlier_labels : numpy array of shape (n_samples,) - For each observation, tells whether or not - it should be considered as an outlier according to the - fitted model. 0 stands for inliers and 1 for outliers. - """ - - check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) - - pred_score, X_left_inds, X_right_inds = self.decision_function(X) - - return (pred_score > self.threshold_).astype( - 'int').ravel(), X_left_inds.ravel(), X_right_inds.ravel() - - def predict_proba(self, X, method='linear'): # pragma: no cover - """Predict the probability of a sample being outlier. Two approaches - are possible: - - 1. simply use Min-max conversion to linearly transform the outlier - scores into the range of [0,1]. The model must be - fitted first. - 2. use unifying scores, see :cite:`kriegel2011interpreting`. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - method : str, optional (default='linear') - probability conversion method. It must be one of - 'linear' or 'unify'. - - Returns - ------- - outlier_probability : numpy array of shape (n_samples,) - For each observation, tells whether or not - it should be considered as an outlier according to the - fitted model. Return the outlier probability, ranging - in [0,1]. - """ - - check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_']) - train_scores = self.decision_scores_ - - test_scores, X_left_inds, X_right_inds = self.decision_function(X) - - probs = np.zeros([test_scores.shape[0], int(self._classes)]) - if method == 'linear': - scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1)) - probs[:, 1] = scaler.transform( - test_scores.reshape(-1, 1)).ravel().clip(0, 1) - probs[:, 0] = 1 - probs[:, 1] - return probs, X_left_inds.ravel(), X_right_inds.ravel() - - elif method == 'unify': - # turn output into probability - pre_erf_score = (test_scores - self._mu) / ( - self._sigma * np.sqrt(2)) - erf_score = erf(pre_erf_score) - probs[:, 1] = erf_score.clip(0, 1).ravel() - probs[:, 0] = 1 - probs[:, 1] - return probs, X_left_inds.ravel(), X_right_inds.ravel() - else: - raise ValueError(method, - 'is not a valid probability conversion method') - - def _predict_rank(self, X, normalized=False): # pragma: no cover - """Predict the outlyingness rank of a sample by a fitted model. The - method is for outlier detector score combination. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - normalized : bool, optional (default=False) - If set to True, all ranks are normalized to [0,1]. - - Returns - ------- - ranks : array, shape (n_samples,) - Outlying rank of a sample according to the training data. 
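Both conversion branches above can be reproduced standalone: 'linear' min-max scales test scores against the training score range, while 'unify' applies the erf transform of :cite:`kriegel2011interpreting` to Gaussian-scaled scores. A small sketch with fabricated score arrays:

import numpy as np
from scipy.special import erf
from sklearn.preprocessing import MinMaxScaler

train_scores = np.array([0.2, 0.4, 0.5, 0.9, 3.0])  # fabricated
test_scores = np.array([0.1, 0.6, 2.5])

# method='linear': min-max against training scores, clipped to [0, 1]
scaler = MinMaxScaler().fit(train_scores.reshape(-1, 1))
p_linear = scaler.transform(test_scores.reshape(-1, 1)).ravel().clip(0, 1)

# method='unify': Gaussian-scale with training mean/std, then erf
mu, sigma = train_scores.mean(), train_scores.std()
p_unify = erf((test_scores - mu) / (sigma * np.sqrt(2))).clip(0, 1)

print(p_linear)
print(p_unify)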
- - """ - - check_is_fitted(self, ['decision_scores_']) - - test_scores = self.decision_function(X) - train_scores = self.decision_scores_ - - sorted_train_scores = np.sort(train_scores) - ranks = np.searchsorted(sorted_train_scores, test_scores) - - if normalized: - # return normalized ranks - ranks = ranks / ranks.max() - return ranks - - def _set_n_classes(self, y): # pragma: no cover - """Set the number of classes if `y` is presented, which is not - expected. It could be useful for multi-class outlier detection. - - Parameters - ---------- - y : numpy array of shape (n_samples,) - Ground truth. - - Returns - ------- - self - """ - - self._classes = 2 # default as binary classification - if y is not None: - check_classification_targets(y) - self._classes = len(np.unique(y)) - warnings.warn( - "y should not be presented in unsupervised learning.") - return self - - def _process_decision_scores(self): # pragma: no cover - """Internal function to calculate key attributes: - - - threshold_: used to decide the binary label - - labels_: binary labels of training data - - Returns - ------- - self - """ - - self.threshold_ = percentile(self.decision_scores_, - 100 * (1 - self.contamination)) - self.labels_ = (self.decision_scores_ > self.threshold_).astype( - 'int').ravel() - - # calculate for predict_proba() - - self._mu = np.mean(self.decision_scores_) - self._sigma = np.std(self.decision_scores_) - - return self - - # noinspection PyMethodParameters - def _get_param_names(cls): # pragma: no cover - # noinspection PyPep8 - """Get parameter names for the estimator - - See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html - and sklearn/base.py for more information. - """ - - # fetch the constructor or the original constructor before - # deprecation wrapping if any - init = getattr(cls.__init__, 'deprecated_original', cls.__init__) - if init is object.__init__: - # No explicit constructor to introspect - return [] - - # introspect the constructor arguments to find the model parameters - # to represent - init_signature = signature(init) - # Consider the constructor parameters excluding 'self' - parameters = [p for p in init_signature.parameters.values() - if p.name != 'self' and p.kind != p.VAR_KEYWORD] - for p in parameters: - if p.kind == p.VAR_POSITIONAL: - raise RuntimeError("scikit-learn estimators should always " - "specify their parameters in the signature" - " of their __init__ (no varargs)." - " %s with constructor %s doesn't " - " follow this convention." - % (cls, init_signature)) - # Extract and sort argument names excluding 'self' - return sorted([p.name for p in parameters]) - - # noinspection PyPep8 - def get_params(self, deep=True): # pragma: no cover - """Get parameters for this estimator. - - See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html - and sklearn/base.py for more information. - - Parameters - ---------- - deep : bool, optional (default=True) - If True, will return the parameters for this estimator and - contained subobjects that are estimators. - - Returns - ------- - params : mapping of string to any - Parameter names mapped to their values. - """ - - out = dict() - for key in self._get_param_names(): - # We need deprecation warnings to always be on in order to - # catch deprecated param values. - # This is set in utils/__init__.py but it gets overwritten - # when running under python3 somehow. 
- warnings.simplefilter("always", DeprecationWarning) - try: - with warnings.catch_warnings(record=True) as w: - value = getattr(self, key, None) - if len(w) and w[0].category == DeprecationWarning: - # if the parameter is deprecated, don't show it - continue - finally: - warnings.filters.pop(0) - - # XXX: should we rather test if instance of estimator? - if deep and hasattr(value, 'get_params'): - deep_items = value.get_params().items() - out.update((key + '__' + k, val) for k, val in deep_items) - out[key] = value - return out - - def set_params(self, **params): # pragma: no cover - # noinspection PyPep8 - """Set the parameters of this estimator. - The method works on simple estimators as well as on nested objects - (such as pipelines). The latter have parameters of the form - ``__`` so that it's possible to update each - component of a nested object. - - See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html - and sklearn/base.py for more information. - - Returns - ------- - self : object - """ - - if not params: - # Simple optimization to gain speed (inspect is slow) - return self - valid_params = self.get_params(deep=True) - - nested_params = defaultdict(dict) # grouped by prefix - for key, value in params.items(): - key, delim, sub_key = key.partition('__') - if key not in valid_params: - raise ValueError('Invalid parameter %s for estimator %s. ' - 'Check the list of available parameters ' - 'with `estimator.get_params().keys()`.' % - (key, self)) - - if delim: - nested_params[key][sub_key] = value - else: - setattr(self, key, value) - - for key, sub_params in nested_params.items(): - valid_params[key].set_params(**sub_params) - - return self - - def __repr__(self): # pragma: no cover - # noinspection PyPep8 - """ - See http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html - and sklearn/base.py for more information. 
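The ``component__parameter`` convention that set_params documents is a single partition on '__': the prefix selects the nested estimator, the remainder is forwarded to it. A toy sketch of that dispatch; detector__n_bins is a hypothetical key, not a specific TODS hyperparameter.

params = {"contamination": 0.05, "detector__n_bins": 20}
for key, value in params.items():
    key, delim, sub_key = key.partition("__")
    if delim:
        print(f"forward {sub_key}={value} to nested estimator {key!r}")
    else:
        print(f"set {key}={value} on this estimator")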
- """ - - class_name = self.__class__.__name__ - return '%s(%s)' % (class_name, _pprint(self.get_params(deep=False), - offset=len(class_name), ),) diff --git a/tods/detection_algorithm/core/CollectiveCommonTest.py b/tods/detection_algorithm/core/CollectiveCommonTest.py deleted file mode 100755 index 1eb6b986..00000000 --- a/tods/detection_algorithm/core/CollectiveCommonTest.py +++ /dev/null @@ -1,174 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import division -from __future__ import print_function - -import os -import sys - -import numpy as np -import unittest -# noinspection PyProtectedMember - -from numpy.testing import assert_equal -from numpy.testing import assert_allclose -from numpy.testing import assert_array_less -from numpy.testing import assert_raises - -from unittest import TestCase - -from sklearn.utils.estimator_checks import check_estimator - -from sklearn.metrics import roc_auc_score -from scipy.stats import rankdata - -# temporary solution for relative imports in case pyod is not installed -# if pyod is installed, no need to use the following line -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -from pyod.utils.data import generate_data - -_dummy = TestCase('__init__') -assert_greater = _dummy.assertGreater -assert_greater_equal = _dummy.assertGreaterEqual -assert_less = _dummy.assertLess -assert_less_equal = _dummy.assertLessEqual - - -class CollectiveCommonTest: - def __init__(self, - model, - X_train, - y_train, - X_test, - y_test, - roc_floor, - ): - self.clf = model - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - self.roc_floor = roc_floor - - self.clf.fit(self.X_train) - - pass - - def test_detector(self): - - self.test_parameters() - self.test_train_scores() - self.test_train_inds() - self.test_prediction_scores() - self.test_prediction_proba() - self.test_prediction_proba_linear() - self.test_prediction_proba_unify() - self.test_prediction_proba_parameter() - # self.test_fit_predict() - # self.test_fit_predict_score() - self.test_prediction_labels() - self.test_prediction_inds() - # self.test_predict_rank() - # self.test_predict_rank_normalized() - self.tearDown() - - def test_parameters(self): - assert (hasattr(self.clf, 'decision_scores_') and - self.clf.decision_scores_ is not None) - assert (hasattr(self.clf, 'labels_') and - self.clf.labels_ is not None) - assert (hasattr(self.clf, 'threshold_') and - self.clf.threshold_ is not None) - assert (hasattr(self.clf, 'left_inds_') and - self.clf.left_inds_ is not None) - assert (hasattr(self.clf, 'right_inds_') and - self.clf.right_inds_ is not None) - assert (hasattr(self.clf, '_mu') and - self.clf._mu is not None) - assert (hasattr(self.clf, '_sigma') and - self.clf._sigma is not None) - - def test_train_scores(self): - assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) - - def test_train_inds(self): - inds_valid = self.clf.left_inds_ < self.clf.right_inds_ - assert_equal(self.clf.left_inds_.shape, self.clf.decision_scores_.shape) - assert_equal(self.clf.right_inds_.shape, self.clf.decision_scores_.shape) - assert_equal(all(inds_valid), True) - - def test_prediction_scores(self): - pred_scores, _, _ = self.clf.decision_function(self.X_test) - # check score shapes - assert_equal(pred_scores.shape[0], self.y_test.shape[0]) - - # check performance - assert_greater(roc_auc_score(self.y_test, pred_scores), self.roc_floor) - - def test_prediction_labels(self): - pred_labels, _, _ = self.clf.predict(self.X_test) - 
assert_equal(pred_labels.shape, self.y_test.shape) - - def test_prediction_inds(self): - _, left_inds, right_inds = self.clf.predict(self.X_test) - inds_valid = left_inds < right_inds - - assert_equal(left_inds.shape, self.y_test.shape) - assert_equal(right_inds.shape, self.y_test.shape) - assert_equal(all(inds_valid), True) - - - def test_prediction_proba(self): - pred_proba, _, _ = self.clf.predict_proba(self.X_test) - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_linear(self): - pred_proba, _, _ = self.clf.predict_proba(self.X_test, method='linear') - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_unify(self): - pred_proba, _, _ = self.clf.predict_proba(self.X_test, method='unify') - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_parameter(self): - with assert_raises(ValueError): - self.clf.predict_proba(self.X_test, method='something') - - def test_fit_predict(self): # pragma: no cover - pred_labels, _, _ = self.clf.fit_predict(X=self.X_train) - assert_equal(pred_labels.shape, self.y_train.shape) - - def test_fit_predict_score(self): # pragma: no cover - self.clf.fit_predict_score(self.X_test, self.y_test) - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='roc_auc_score') - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='prc_n_score') - with assert_raises(NotImplementedError): - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='something') - - def test_predict_rank(self): # pragma: no cover - pred_socres, _, _ = self.clf.decision_function(self.X_test) - pred_ranks = self.clf._predict_rank(self.X_test) - - # assert the order is reserved - assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) - assert_array_less(pred_ranks, self.X_train.shape[0] + 1) - assert_array_less(-0.1, pred_ranks) - - def test_predict_rank_normalized(self): # pragma: no cover - pred_socres, _, _ = self.clf.decision_function(self.X_test) - pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) - - # assert the order is reserved - assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) - assert_array_less(pred_ranks, 1.01) - assert_array_less(-0.1, pred_ranks) - - def tearDown(self): - pass diff --git a/tods/detection_algorithm/core/KDiscord.py b/tods/detection_algorithm/core/KDiscord.py deleted file mode 100644 index ff74a217..00000000 --- a/tods/detection_algorithm/core/KDiscord.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- coding: utf-8 -*- -"""Autoregressive model for multivariate time series outlier detection. -""" -import numpy as np -from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted - -from .CollectiveBase import CollectiveBaseDetector -from pyod.models.knn import KNN - -from .utility import get_sub_matrices - - -# TODO: add an argument to exclude "near equal" samples -# TODO: another thought is to treat each dimension independent -class KDiscord(CollectiveBaseDetector): - """KDiscord first split multivariate time series into - subsequences (matrices), and it use kNN outlier detection based on PyOD. - For an observation, its distance to its kth nearest neighbor could be - viewed as the outlying score. It could be viewed as a way to measure - the density. See :cite:`ramaswamy2000efficient,angiulli2002fast` for - details. - - See :cite:`aggarwal2015outlier,zhao2020using` for details. 
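The split-then-kNN idea described above is easy to sketch directly on PyOD's KNN detector: flatten sliding windows into rows and let the distance to the kth nearest neighbor score each subsequence. The helper below builds the windows itself instead of using get_sub_matrices; it assumes numpy >= 1.20 and pyod, and is an illustration rather than the deleted KDiscord class.

import numpy as np
from pyod.models.knn import KNN

def knn_discord_scores(x, window_size=3, n_neighbors=2):
    x = np.asarray(x, dtype=float).ravel()
    # Each row is one flattened subsequence of the series.
    windows = np.lib.stride_tricks.sliding_window_view(x, window_size)
    clf = KNN(n_neighbors=n_neighbors)
    clf.fit(windows)
    return clf.decision_scores_  # distance to kth neighbor, per window

print(knn_discord_scores([3., 4., 8., 16., 18., 13., 22., 36., 59., 128.]))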
- - Parameters - ---------- - window_size : int - The moving window size. - - step_size : int, optional (default=1) - The displacement for moving window. - - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, - i.e. the proportion of outliers in the data set. Used when fitting to - define the threshold on the decision function. - - n_neighbors : int, optional (default = 5) - Number of neighbors to use by default for k neighbors queries. - - method : str, optional (default='largest') - {'largest', 'mean', 'median'} - - - 'largest': use the distance to the kth neighbor as the outlier score - - 'mean': use the average of all k neighbors as the outlier score - - 'median': use the median of the distance to k neighbors as the - outlier score - - radius : float, optional (default = 1.0) - Range of parameter space to use by default for `radius_neighbors` - queries. - - algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, optional - Algorithm used to compute the nearest neighbors: - - - 'ball_tree' will use BallTree - - 'kd_tree' will use KDTree - - 'brute' will use a brute-force search. - - 'auto' will attempt to decide the most appropriate algorithm - based on the values passed to :meth:`fit` method. - - Note: fitting on sparse input will override the setting of - this parameter, using brute force. - - .. deprecated:: 0.74 - ``algorithm`` is deprecated in PyOD 0.7.4 and will not be - possible in 0.7.6. It has to use BallTree for consistency. - - leaf_size : int, optional (default = 30) - Leaf size passed to BallTree. This can affect the - speed of the construction and query, as well as the memory - required to store the tree. The optimal value depends on the - nature of the problem. - - metric : string or callable, default 'minkowski' - metric to use for distance computation. Any metric from scikit-learn - or scipy.spatial.distance can be used. - - If metric is a callable function, it is called on each - pair of instances (rows) and the resulting value recorded. The callable - should take two arrays as input and return one value indicating the - distance between them. This works for Scipy's metrics, but is less - efficient than passing the metric name as a string. - - Distance matrices are not supported. - - Valid values for metric are: - - - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', - 'manhattan'] - - - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', - 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', - 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', - 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', - 'sqeuclidean', 'yule'] - - See the documentation for scipy.spatial.distance for details on these - metrics. - - p : integer, optional (default = 2) - Parameter for the Minkowski metric from - sklearn.metrics.pairwise.pairwise_distances. When p = 1, this is - equivalent to using manhattan_distance (l1), and euclidean_distance - (l2) for p = 2. For arbitrary p, minkowski_distance (l_p) is used. - See http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.pairwise_distances - - metric_params : dict, optional (default = None) - Additional keyword arguments for the metric function. - - n_jobs : int, optional (default = 1) - The number of parallel jobs to run for neighbors search. - If ``-1``, then the number of jobs is set to the number of CPU cores. - Affects only kneighbors and kneighbors_graph methods. 
- - Attributes - ---------- - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. - The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is - fitted. - - threshold_ : float - The threshold is based on ``contamination``. It is the - ``n_samples * contamination`` most abnormal samples in - ``decision_scores_``. The threshold is calculated for generating - binary outlier labels. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. - """ - - def __init__(self, window_size, step_size=1, contamination=0.1, - n_neighbors=5, method='largest', - radius=1.0, algorithm='auto', leaf_size=30, - metric='minkowski', p=2, metric_params=None, n_jobs=1, - **kwargs): - super(KDiscord, self).__init__(contamination=contamination) - self.window_size = window_size - self.step_size = step_size - - # parameters for kNN - self.n_neighbors = n_neighbors - self.method = method - self.radius = radius - self.algorithm = algorithm - self.leaf_size = leaf_size - self.metric = metric - self.p = p - self.metric_params = metric_params - self.n_jobs = n_jobs - - # initialize a kNN model - self.model_ = KNN(contamination=self.contamination, - n_neighbors=self.n_neighbors, - radius=self.radius, - algorithm=self.algorithm, - leaf_size=self.leaf_size, - metric=self.metric, - p=self.p, - metric_params=self.metric_params, - n_jobs=self.n_jobs, - **kwargs) - - def fit(self, X: np.array) -> object: - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. - """ - X = check_array(X).astype(np.float) - - # first convert it into submatrices, and flatten it - sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( - X, - self.window_size, - self.step_size, - return_numpy=True, - flatten=True) - - # fit the kNN model - self.model_.fit(sub_matrices) - self.decision_scores_ = self.model_.decision_scores_ - self._process_decision_scores() - return self - - def decision_function(self, X: np.array): - """Predict raw anomaly scores of X using the fitted detector. - - The anomaly score of an input sample is computed based on the fitted - detector. For consistency, outliers are assigned with - higher anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. 
- """ - check_is_fitted(self, ['model_']) - X = check_array(X).astype(np.float) - # first convert it into submatrices, and flatten it - sub_matrices, X_left_inds, X_right_inds = get_sub_matrices( - X, - self.window_size, - self.step_size, - return_numpy=True, - flatten=True) - - # return the prediction result by kNN - return self.model_.decision_function(sub_matrices), \ - X_left_inds.ravel(), X_right_inds.ravel() - - -if __name__ == "__main__": # pragma: no cover - X_train = np.asarray( - [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, - 100]).reshape(-1, 1) - - X_test = np.asarray( - [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, - 1) - - # X_train = np.asarray( - # [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], - # [12., 12], - # [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) - # - # X_test = np.asarray( - # [[12., 10], [8., 12], [80., 80], [92., 983], - # [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], - # [22., 15]]) - - clf = KDiscord(window_size=3, step_size=1, contamination=0.2, - n_neighbors=5) - - clf.fit(X_train) - decision_scores, left_inds_, right_inds = clf.decision_scores_, \ - clf.left_inds_, clf.right_inds_ - print(clf.left_inds_, clf.right_inds_) - pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) - pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) - pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) - - print(pred_scores) - print(pred_labels) - print(pred_probs) diff --git a/tods/detection_algorithm/core/LSTMOD.py b/tods/detection_algorithm/core/LSTMOD.py deleted file mode 100644 index ee0cbaad..00000000 --- a/tods/detection_algorithm/core/LSTMOD.py +++ /dev/null @@ -1,266 +0,0 @@ -# -*- coding: utf-8 -*- -"""Autoregressive model for univariate time series outlier detection. 
-""" -import numpy as np -from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted -from scipy.special import erf -from sklearn.preprocessing import MinMaxScaler - -from .CollectiveBase import CollectiveBaseDetector - -# from tod.utility import get_sub_matrices - -from tensorflow.keras.layers import Dense, LSTM -from tensorflow.keras.models import Sequential - -class LSTMOutlierDetector(CollectiveBaseDetector): - - def __init__(self,contamination=0.1, - train_contamination=0.0, - min_attack_time=5, - danger_coefficient_weight=0.5, - loss='mean_squared_error', - optimizer='adam', - epochs=10, - batch_size=8, - dropout_rate=0.0, - feature_dim=9, - hidden_dim=1, - n_hidden_layer=0, - activation=None, - diff_group_method='average' - ): - - super(LSTMOutlierDetector, self).__init__(contamination=contamination, - # window_size=min_attack_time, - step_size=1, - ) - - self.train_contamination = train_contamination - self.min_attack_time = min_attack_time - self.danger_coefficient_weight = danger_coefficient_weight - self.relative_error_threshold = None - - self.loss = loss - self.optimizer = optimizer - self.epochs = epochs - self.batch_size = batch_size - self.dropout_rate = dropout_rate - self.feature_dim = feature_dim - self.hidden_dim = hidden_dim - self.n_hidden_layer = n_hidden_layer - self.diff_group_method = diff_group_method - self.activation = activation - - - # def _build_model(self): - # print('dim:', self.hidden_dim, self.feature_dim) - # model_ = Sequential() - # model_.add(LSTM(units=self.hidden_dim, input_shape=(self.feature_dim, 1), - # dropout=self.dropout_rate, activation=self.activation, return_sequences=True)) - - # for layer_idx in range(self.n_hidden_layer-1): - # model_.add(LSTM(units=self.hidden_dim, input_shape=(self.hidden_dim, 1), - # dropout=self.dropout_rate, activation=self.activation, return_sequences=True)) - - # model_.add(LSTM(units=self.hidden_dim, input_shape=(self.hidden_dim, 1), - # dropout=self.dropout_rate, activation=self.activation)) - - # model_.add(Dense(units=self.feature_dim, input_shape=(self.hidden_dim, 1), activation=None)) - - # model_.compile(loss=self.loss, optimizer=self.optimizer) - # return model_ - - def _build_model(self): - model_ = Sequential() - model_.add(LSTM(units=self.hidden_dim, input_shape=(self.feature_dim, 1), - dropout=self.dropout_rate, activation=self.activation, - return_sequences=bool(self.n_hidden_layer>0))) - - for layer_idx in range(self.n_hidden_layer): - model_.add(LSTM(units=self.hidden_dim, input_shape=(self.hidden_dim, 1), - dropout=self.dropout_rate, activation=self.activation, - return_sequences=bool(layer_idx < self.n_hidden_layer - 1))) - - model_.add(Dense(units=self.feature_dim, input_shape=(self.hidden_dim, 1), activation=None)) - - model_.compile(loss=self.loss, optimizer=self.optimizer) - return model_ - - def fit(self, X: np.array, y=None) -> object: - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. 
- """ - print("XXXX:", X.shape) - X = check_array(X).astype(np.float) - self._set_n_classes(None) - X_buf, y_buf = self._get_sub_matrices(X) - self.feature_dim = X_buf.shape[1] - self.model_ = self._build_model() - - # fit the LSTM model - self.model_.fit(X_buf, y_buf, epochs=self.epochs, batch_size=self.batch_size) - - relative_error = self._relative_error(X) - - if self.train_contamination < 1e-6: - self.relative_error_threshold = max(relative_error) - else: - self.relative_error_threshold = np.percentile(relative_error, 100 * (1 - self.train_contamination)) - - self.decision_scores_, self.left_inds_, self.right_inds_ = self.decision_function(X) - self._process_decision_scores() - - return self - - def _get_sub_matrices(self, X: np.array): - # return X[:-1].reshape(-1, 1, self.feature_dim), X[1:] - return np.expand_dims(X[:-1], axis=2), X[1:] - - - def _relative_error(self, X: np.array): - - X = check_array(X).astype(np.float) - X_buf, y_buf = self._get_sub_matrices(X) - - y_predict = self.model_.predict(X_buf) - - relative_error = (np.linalg.norm(y_predict - y_buf, axis=1) / np.linalg.norm(y_buf + 1e-6, axis=1)).ravel() - - return relative_error - - - def decision_function(self, X: np.array): - """Predict raw anomaly scores of X using the fitted detector. - - The anomaly score of an input sample is computed based on the fitted - detector. For consistency, outliers are assigned with - higher anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. - """ - check_is_fitted(self, ['model_']) - - relative_error = self._relative_error(X) - - error_num_buf = (relative_error > self.relative_error_threshold).astype(int) - - if not (self.diff_group_method in ['max', 'min', 'average']): - raise ValueError(self.diff_group_method, "is not a valid method") - - relative_error_left_inds = np.ones((len(relative_error), )) * len(relative_error) - relative_error_right_inds = np.zeros((len(relative_error), )) - - - if self.diff_group_method == 'average': - danger_coefficient = np.zeros(relative_error.shape) - averaged_relative_error = np.zeros(relative_error.shape) - calculated_times = np.zeros(relative_error.shape) - - for i in range(len(relative_error) - self.min_attack_time + 1): - dc_tmp = error_num_buf[i:i+self.min_attack_time].sum() / self.min_attack_time - are_tmp = relative_error[i:i+self.min_attack_time].sum() / self.min_attack_time - - for j in range(self.min_attack_time): - averaged_relative_error[i + j] += are_tmp - danger_coefficient[i + j] += dc_tmp - calculated_times[i + j] += 1 - relative_error_left_inds[i + j] = i if i < relative_error_left_inds[i + j] else relative_error_left_inds[i + j] - relative_error_right_inds[i + j] = i+self.min_attack_time if i+self.min_attack_time > relative_error_right_inds[i + j] else relative_error_left_inds[i + j] - - # print(calculated_times) - danger_coefficient /= calculated_times - averaged_relative_error /= calculated_times - # print(danger_coefficient, averaged_relative_error) - - - else: # pragma: no cover - danger_coefficient = np.zeros(relative_error.shape) - averaged_relative_error = np.zeros(relative_error.shape) - - if self.diff_group_method == 'min': - danger_coefficient += float('inf') - averaged_relative_error += float('inf') - - for i in range(len(relative_error) - self.min_attack_time + 1): 
- dc_tmp = error_num_buf[i:i+self.min_attack_time].sum() / self.min_attack_time - are_tmp = relative_error[i:i+self.min_attack_time].sum() / self.min_attack_time - - if self.diff_group_method == 'max': - for j in range(self.min_attack_time): - if are_tmp > averaged_relative_error[i + j] or dc_tmp > danger_coefficient[i+j]: - relative_error_left_inds[i + j] = i - relative_error_right_inds[i + j] = i+self.min_attack_time - if are_tmp > averaged_relative_error[i + j]: - averaged_relative_error[i + j] = are_tmp - if dc_tmp > danger_coefficient[i+j]: - danger_coefficient[i + j] = dc_tmp - - else: - for j in range(self.min_attack_time): - if are_tmp < averaged_relative_error[i + j] or dc_tmp < danger_coefficient[i+j]: - relative_error_left_inds[i + j] = i - relative_error_right_inds[i + j] = i+self.min_attack_time - if are_tmp < averaged_relative_error[i + j]: - averaged_relative_error[i + j] = are_tmp - if dc_tmp < danger_coefficient[i+j]: - danger_coefficient[i + j] = dc_tmp - - - # print(relative_error_left_inds) - # print(relative_error_right_inds) - pred_score = danger_coefficient * self.danger_coefficient_weight + averaged_relative_error * (1 - self.danger_coefficient_weight) - - pred_score = np.concatenate((np.zeros((self.window_size,)), pred_score)) - relative_error_left_inds = np.concatenate((np.arange(self.window_size), relative_error_left_inds+self.window_size)) - relative_error_right_inds = np.concatenate((np.arange(self.window_size)+self.window_size, relative_error_right_inds+self.window_size)) - - return pred_score, relative_error_left_inds, relative_error_right_inds - - -def main(): - X_train = np.asarray( - [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) - - X_test = np.asarray( - [3., 4., 8., 16.1, 18.2, 36.2, 57.1, -10.3, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1,1) - - print(X_train.shape, X_test.shape) - - clf = LSTMOutlierDetector(contamination=0.1) - clf.fit(X_train) - # pred_scores = clf.decision_function(X_test) - pred_labels, left_inds, right_inds = clf.predict(X_test) - - print(pred_labels.shape, left_inds.shape, right_inds.shape) - - print(clf.threshold_) - # print(np.percentile(pred_scores, 100 * 0.9)) - - # print('pred_scores: ',pred_scores) - print('pred_labels: ',pred_labels) - -if __name__ == "__main__": # pragma: no cover - main() diff --git a/tods/detection_algorithm/core/MultiAutoRegOD.py b/tods/detection_algorithm/core/MultiAutoRegOD.py deleted file mode 100644 index 01834bc9..00000000 --- a/tods/detection_algorithm/core/MultiAutoRegOD.py +++ /dev/null @@ -1,238 +0,0 @@ -# -*- coding: utf-8 -*- -"""Autoregressive model for multivariate time series outlier detection. -""" -import numpy as np -from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted -from sklearn.utils import column_or_1d - -from .CollectiveBase import CollectiveBaseDetector -from combo.models.score_comb import average, maximization, median, aom, moa -from combo.utils.utility import standardizer - -from .AutoRegOD import AutoRegOD -from .utility import get_sub_sequences_length - - -class MultiAutoRegOD(CollectiveBaseDetector): - """Autoregressive models use linear regression to calculate a sample's - deviance from the predicted value, which is then used as its - outlier scores. This model is for multivariate time series. - This model handles multivariate time series by various combination - approaches. See AutoRegOD for univarite data. - - See :cite:`aggarwal2015outlier,zhao2020using` for details. 
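After each dimension is scored by its own AutoRegOD model, the per-dimension scores are standardized and merged; that combination step uses the same combo helpers imported above and is small enough to sketch. The score matrix here is fabricated.

import numpy as np
from combo.models.score_comb import average, maximization, median
from combo.utils.utility import standardizer

scores = np.array([[0.1, 5.0],   # rows: windows; columns: one AR model per dimension
                   [0.2, 4.0],
                   [3.0, 0.5]])
scores_std, scaler = standardizer(scores, keep_scalar=True)

print(average(scores_std))       # method='average'
print(maximization(scores_std))  # method='maximization'
print(median(scores_std))        # method='median'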
- - Parameters - ---------- - window_size : int - The moving window size. - - step_size : int, optional (default=1) - The displacement for moving window. - - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, i.e. - the proportion of outliers in the data set. When fitting this is used - to define the threshold on the decision function. - - method : str, optional (default='average') - Combination method: {'average', 'maximization', - 'median'}. Pass in weights of detector for weighted version. - - weights : numpy array of shape (1, n_dimensions) - Score weight by dimensions. - - Attributes - ---------- - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. - The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is - fitted. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. - """ - - def __init__(self, window_size, step_size=1, method='average', - weights=None, contamination=0.1): - super(MultiAutoRegOD, self).__init__(contamination=contamination) - self.window_size = window_size - self.step_size = step_size - self.method = method - self.weights = weights - - def _validate_weights(self): - """Internal function for validating and adjust weights. - - Returns - ------- - - """ - if self.weights is None: - self.weights = np.ones([1, self.n_models_]) - else: - self.weights = column_or_1d(self.weights).reshape( - 1, len(self.weights)) - assert (self.weights.shape[1] == self.n_models_) - - # adjust probability by a factor for integrity - adjust_factor = self.weights.shape[1] / np.sum(self.weights) - self.weights = self.weights * adjust_factor - - def _fit_univariate_model(self, X): - """Internal function for fitting one dimensional ts. - """ - X = check_array(X) - n_samples, n_sequences = X.shape[0], X.shape[1] - - models = [] - - # train one model for each dimension - for i in range(n_sequences): - models.append(AutoRegOD(window_size=self.window_size, - step_size=self.step_size, - contamination=self.contamination)) - models[i].fit(X[:, i].reshape(-1, 1)) - - return models - - def _score_combination(self, scores): # pragma: no cover - """Internal function for combining univarite scores. - """ - - # combine by different approaches - if self.method == 'average': - return average(scores, estimator_weights=self.weights) - if self.method == 'maximization': - return maximization(scores) - if self.method == 'median': - return median(scores) - - def fit(self, X: np.array) -> object: - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. 
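_validate_weights above rescales user weights so they sum to the number of per-dimension models before they enter the weighted average. The rescaling itself, with a fabricated weight vector:

import numpy as np

weights = np.array([[1.0, 3.0]])            # one weight per dimension
n_models = weights.shape[1]
weights = weights * (n_models / weights.sum())
print(weights, weights.sum())               # now sums to n_models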
- """ - X = check_array(X).astype(np.float) - - # fit each dimension individually - self.models_ = self._fit_univariate_model(X) - self.valid_len_ = self.models_[0].valid_len_ - self.n_models_ = len(self.models_) - - # assign the left and right inds, same for all models - self.left_inds_ = self.models_[0].left_inds_ - self.right_inds_ = self.models_[0].right_inds_ - - # validate and adjust weights - self._validate_weights() - - # combine the scores from all dimensions - self._decison_mat = np.zeros([self.valid_len_, self.n_models_]) - for i in range(self.n_models_): - self._decison_mat[:, i] = self.models_[i].decision_scores_ - - # scale scores by standardization before score combination - self._decison_mat_scalaled, self._score_scalar = standardizer( - self._decison_mat, keep_scalar=True) - - self.decision_scores_ = self._score_combination( - self._decison_mat_scalaled) - - # print(self.decision_scores_.shape, self.left_inds_.shape, self.right_inds_.shape) - self.decision_scores_ = np.concatenate((np.zeros((self.window_size,)), self.decision_scores_)) - self.left_inds_ = np.concatenate(((-self.window_size) * np.ones((self.window_size,)).astype(np.int), self.left_inds_)) - self.right_inds_ = np.concatenate((np.zeros((self.window_size,)).astype(np.int), self.right_inds_)) - # print(self.decision_scores_.shape, self.left_inds_.shape, self.right_inds_.shape) - - self._process_decision_scores() - return self - - def decision_function(self, X: np.array): - """Predict raw anomaly scores of X using the fitted detector. - - The anomaly score of an input sample is computed based on the fitted - detector. For consistency, outliers are assigned with - higher anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. 
- """ - check_is_fitted(self, ['models_']) - X = check_array(X).astype(np.float) - assert (X.shape[1] == self.n_models_) - n_samples = len(X) - - # need to subtract 1 because need to have y for subtraction - valid_len = get_sub_sequences_length(n_samples, self.window_size, - self.step_size) - 1 - - # combine the scores from all dimensions - decison_mat = np.zeros([valid_len, self.n_models_]) - for i in range(self.n_models_): - decison_mat[:, i], X_left_inds, X_right_inds = \ - self.models_[i].decision_function(X[:, i].reshape(-1, 1)) - - # scale the decision mat - decison_mat_scaled = self._score_scalar.transform(decison_mat) - decision_scores = self._score_combination(decison_mat_scaled) - - # print(decision_scores.shape, X_left_inds.shape, X_right_inds.shape) - decision_scores = np.concatenate((np.zeros((self.window_size,)), decision_scores)) - X_left_inds = np.concatenate(((-self.window_size)*np.ones((self.window_size,)).astype(np.int), X_left_inds)) - X_right_inds = np.concatenate((np.zeros((self.window_size,)).astype(np.int), X_right_inds)) - # print(decision_scores.shape, X_left_inds.shape, X_right_inds.shape) - - return decision_scores, X_left_inds, X_right_inds - - -if __name__ == "__main__": # pragma: no cover - X_train = np.asarray( - [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], - [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) - - X_test = np.asarray( - [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], - [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) - - # X_test = np.asarray( - # [[12., 10], [8., 12], [80., 80], [92., 983], - # [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], - # [22., 15]]) - - clf = MultiAutoRegOD(window_size=3, step_size=1, contamination=0.2) - - clf.fit(X_train) - decision_scores, left_inds_, right_inds = clf.decision_scores_, \ - clf.left_inds_, clf.right_inds_ - print(clf.left_inds_, clf.right_inds_) - pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) - pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) - pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) - - print(pred_scores) - print(pred_labels) - print(pred_probs) diff --git a/tods/detection_algorithm/core/PCA.py b/tods/detection_algorithm/core/PCA.py deleted file mode 100644 index 502beb32..00000000 --- a/tods/detection_algorithm/core/PCA.py +++ /dev/null @@ -1,264 +0,0 @@ -# -*- coding: utf-8 -*- -"""Autoregressive model for multivariate time series outlier detection. -""" -import numpy as np -from sklearn.utils import check_array -from sklearn.utils.validation import check_is_fitted - -from .CollectiveBase import CollectiveBaseDetector -from pyod.models.pca import PCA as PCA_PYOD - -from .utility import get_sub_matrices - - -class PCA(CollectiveBaseDetector): - """PCA-based outlier detection with both univariate and multivariate - time series data. TS data will be first transformed to tabular format. - For univariate data, it will be in shape of [valid_length, window_size]. - for multivariate data with d sequences, it will be in the shape of - [valid_length, window_size]. - - Parameters - ---------- - window_size : int - The moving window size. - - step_size : int, optional (default=1) - The displacement for moving window. - - contamination : float in (0., 0.5), optional (default=0.1) - The amount of contamination of the data set, - i.e. the proportion of outliers in the data set. Used when fitting to - define the threshold on the decision function. 
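As the docstring above says, the series is first flattened into a [valid_length, window_size * n_dims] table; PyOD's PCA detector then scores each row. A minimal sketch, assuming numpy >= 1.20 and pyod; this is an illustration, not the deleted TODS wrapper.

import numpy as np
from pyod.models.pca import PCA as PCA_PYOD

x = np.array([3., 4., 8., 16., 18., 13., 22., 36., 59., 128.])
windows = np.lib.stride_tricks.sliding_window_view(x, 3)  # tabular format

clf = PCA_PYOD(n_components=2)
clf.fit(windows)
print(clf.decision_scores_)  # one score per window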
- - n_components : int, float, None or string - Number of components to keep. It should be smaller than the window_size. - if n_components is not set all components are kept:: - - n_components == min(n_samples, n_features) - - if n_components == 'mle' and svd_solver == 'full', Minka\'s MLE is used - to guess the dimension - if ``0 < n_components < 1`` and svd_solver == 'full', select the number - of components such that the amount of variance that needs to be - explained is greater than the percentage specified by n_components - n_components cannot be equal to n_features for svd_solver == 'arpack'. - - n_selected_components : int, optional (default=None) - Number of selected principal components - for calculating the outlier scores. It is not necessarily equal to - the total number of the principal components. If not set, use - all principal components. - - copy : bool (default True) - If False, data passed to fit are overwritten and running - fit(X).transform(X) will not yield the expected results, - use fit_transform(X) instead. - - whiten : bool, optional (default False) - When True (False by default) the `components_` vectors are multiplied - by the square root of n_samples and then divided by the singular values - to ensure uncorrelated outputs with unit component-wise variances. - - Whitening will remove some information from the transformed signal - (the relative variance scales of the components) but can sometime - improve the predictive accuracy of the downstream estimators by - making their data respect some hard-wired assumptions. - - svd_solver : string {'auto', 'full', 'arpack', 'randomized'} - auto : - the solver is selected by a default policy based on `X.shape` and - `n_components`: if the input data is larger than 500x500 and the - number of components to extract is lower than 80% of the smallest - dimension of the data, then the more efficient 'randomized' - method is enabled. Otherwise the exact full SVD is computed and - optionally truncated afterwards. - full : - run exact full SVD calling the standard LAPACK solver via - `scipy.linalg.svd` and select the components by postprocessing - arpack : - run SVD truncated to n_components calling ARPACK solver via - `scipy.sparse.linalg.svds`. It requires strictly - 0 < n_components < X.shape[1] - randomized : - run randomized SVD by the method of Halko et al. - - tol : float >= 0, optional (default .0) - Tolerance for singular values computed by svd_solver == 'arpack'. - - iterated_power : int >= 0, or 'auto', (default 'auto') - Number of iterations for the power method computed by - svd_solver == 'randomized'. - - random_state : int, RandomState instance or None, optional (default None) - If int, random_state is the seed used by the random number generator; - If RandomState instance, random_state is the random number generator; - If None, the random number generator is the RandomState instance used - by `np.random`. Used when ``svd_solver`` == 'arpack' or 'randomized'. - - weighted : bool, optional (default=True) - If True, the eigenvalues are used in score computation. - The eigenvectors with small eigenvalues comes with more importance - in outlier score calculation. - - standardization : bool, optional (default=True) - If True, perform standardization first to convert - data to zero mean and unit variance. - See http://scikit-learn.org/stable/auto_examples/preprocessing/plot_scaling_importance.html - - Attributes - ---------- - decision_scores_ : numpy array of shape (n_samples,) - The outlier scores of the training data. 
- The higher, the more abnormal. Outliers tend to have higher - scores. This value is available once the detector is - fitted. - - threshold_ : float - The threshold is based on ``contamination``. It is the - ``n_samples * contamination`` most abnormal samples in - ``decision_scores_``. The threshold is calculated for generating - binary outlier labels. - - labels_ : int, either 0 or 1 - The binary labels of the training data. 0 stands for inliers - and 1 for outliers/anomalies. It is generated by applying - ``threshold_`` on ``decision_scores_``. - """ - - def __init__(self, window_size, step_size=1, contamination=0.1, - n_components=None, n_selected_components=None, - copy=True, whiten=False, svd_solver='auto', - tol=0.0, iterated_power='auto', random_state=None, - weighted=True, standardization=True): - super(PCA, self).__init__(contamination=contamination) - self.window_size = window_size - self.step_size = step_size - - # parameters for PCA - self.n_components = n_components - self.n_selected_components = n_selected_components - self.copy = copy - self.whiten = whiten - self.svd_solver = svd_solver - self.tol = tol - self.iterated_power = iterated_power - self.random_state = random_state - self.weighted = weighted - self.standardization = standardization - - # initialize a kNN model - self.model_ = PCA_PYOD(n_components=self.n_components, - n_selected_components=self.n_selected_components, - contamination=self.contamination, - copy=self.copy, - whiten=self.whiten, - svd_solver=self.svd_solver, - tol=self.tol, - iterated_power=self.iterated_power, - random_state=self.random_state, - weighted=self.weighted, - standardization=self.standardization) - - def fit(self, X: np.array) -> object: - """Fit detector. y is ignored in unsupervised methods. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. - - y : Ignored - Not used, present for API consistency by convention. - - Returns - ------- - self : object - Fitted estimator. - """ - X = check_array(X).astype(np.float) - - # first convert it into submatrices, and flatten it - sub_matrices, self.left_inds_, self.right_inds_ = get_sub_matrices( - X, - self.window_size, - self.step_size, - return_numpy=True, - flatten=True, - flatten_order='F') - - # if self.n_components > sub_matrices.shape[1]: - # raise ValueError('n_components exceeds window_size times the number of sequences.') - - # fit the PCA model - self.model_.fit(sub_matrices) - self.decision_scores_ = self.model_.decision_scores_ - self._process_decision_scores() - return self - - def decision_function(self, X: np.array): - """Predict raw anomaly scores of X using the fitted detector. - - The anomaly score of an input sample is computed based on the fitted - detector. For consistency, outliers are assigned with - higher anomaly scores. - - Parameters - ---------- - X : numpy array of shape (n_samples, n_features) - The input samples. Sparse matrices are accepted only - if they are supported by the base estimator. - - Returns - ------- - anomaly_scores : numpy array of shape (n_samples,) - The anomaly score of the input samples. 
- """ - check_is_fitted(self, ['model_']) - X = check_array(X).astype(np.float) - # first convert it into submatrices, and flatten it - sub_matrices, X_left_inds, X_right_inds = get_sub_matrices( - X, - self.window_size, - self.step_size, - return_numpy=True, - flatten=True, - flatten_order='F') - - # return the prediction result by PCA - return self.model_.decision_function( - sub_matrices), X_left_inds.ravel(), X_right_inds.ravel() - - -if __name__ == "__main__": # pragma: no cover - # X_train = np.asarray( - # [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, 100]).reshape(-1, 1) - - # X_test = np.asarray( - # [3., 4., 8.6, 13.4, 22.5, 17, 19.2, 36.1, 127, -23, 59.2]).reshape(-1, - # 1) - - X_train = np.asarray( - [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], - [12., 12], - [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) - - w = get_sub_matrices(X_train, window_size=3, step=2, flatten=False) - X_test = np.asarray( - [[12., 10], [8., 12], [80., 80], [92., 983], - [18., 16], [20., 7], [18., 10], [3., 5], [5., 9], [23., 12], - [22., 15]]) - - clf = PCA(window_size=3, step_size=2, contamination=0.2) - - clf.fit(X_train) - decision_scores, left_inds_, right_inds = clf.decision_scores_, \ - clf.left_inds_, clf.right_inds_ - print(clf.left_inds_, clf.right_inds_) - pred_scores, X_left_inds, X_right_inds = clf.decision_function(X_test) - pred_labels, X_left_inds, X_right_inds = clf.predict(X_test) - pred_probs, X_left_inds, X_right_inds = clf.predict_proba(X_test) - - print(pred_scores) - print(pred_labels) - print(pred_probs) diff --git a/tods/detection_algorithm/core/SODCommonTest.py b/tods/detection_algorithm/core/SODCommonTest.py deleted file mode 100755 index 2658e971..00000000 --- a/tods/detection_algorithm/core/SODCommonTest.py +++ /dev/null @@ -1,154 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import division -from __future__ import print_function - -import os -import sys - -import numpy as np -import unittest -# noinspection PyProtectedMember -from numpy.testing import assert_equal -from numpy.testing import assert_allclose -from numpy.testing import assert_array_less -from numpy.testing import assert_raises - -from unittest import TestCase - -from sklearn.utils.estimator_checks import check_estimator - -from sklearn.metrics import roc_auc_score -from scipy.stats import rankdata - -# temporary solution for relative imports in case pyod is not installed -# if pyod is installed, no need to use the following line -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -from pyod.utils.data import generate_data - -_dummy = TestCase('__init__') -assert_greater = _dummy.assertGreater -assert_greater_equal = _dummy.assertGreaterEqual -assert_less = _dummy.assertLess -assert_less_equal = _dummy.assertLessEqual - - -class SODCommonTest: - def __init__(self, - model, - X_train, - y_train, - X_test, - y_test, - roc_floor, - ): - self.clf = model - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - self.roc_floor = roc_floor - - self.clf.fit(X=self.X_train, y=self.y_train) - - pass - - def test_detector(self): - - self.test_parameters() - self.test_train_scores() - self.test_prediction_scores() - self.test_prediction_proba() - #self.test_prediction_proba_linear() - #self.test_prediction_proba_unify() - #self.test_prediction_proba_parameter() - # self.test_fit_predict() - # self.test_fit_predict_score() - self.test_prediction_labels() - # self.test_predict_rank() - # 
self.test_predict_rank_normalized() - self.tearDown() - - def test_parameters(self): - assert (hasattr(self.clf, 'decision_scores_') and - self.clf.decision_scores_ is not None) - assert (hasattr(self.clf, 'labels_') and - self.clf.labels_ is not None) - #assert (hasattr(self.clf, 'threshold_') and - # self.clf.threshold_ is not None) - #assert (hasattr(self.clf, '_mu') and - # self.clf._mu is not None) - #assert (hasattr(self.clf, '_sigma') and - # self.clf._sigma is not None) - - def test_train_scores(self): - assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) - - def test_prediction_scores(self): - pred_scores = self.clf.decision_function(self.X_test) - - # check score shapes - assert_equal(pred_scores.shape[0], self.y_test.shape[0]) - - # check performance - assert_greater_equal(roc_auc_score(self.y_test, pred_scores), self.roc_floor) - - def test_prediction_labels(self): - pred_labels = self.clf.predict(self.X_test) - self.y_test = np.squeeze(self.y_test) - assert_equal(pred_labels.shape, self.y_test.shape) - - def test_prediction_proba(self): - pred_proba = self.clf.predict_proba(self.X_test) - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_linear(self): - pred_proba = self.clf.predict_proba(self.X_test, method='linear') - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_unify(self): - pred_proba = self.clf.predict_proba(self.X_test, method='unify') - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_parameter(self): - with assert_raises(ValueError): - self.clf.predict_proba(self.X_test, method='something') - - def test_fit_predict(self): # pragma: no cover - pred_labels = self.clf.fit_predict(X=self.X_train, y=self.y_train) - assert_equal(pred_labels.shape, self.y_train.shape) - - def test_fit_predict_score(self): # pragma: no cover - self.clf.fit_predict_score(self.X_test, self.y_test) - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='roc_auc_score') - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='prc_n_score') - with assert_raises(NotImplementedError): - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='something') - - def test_predict_rank(self): # pragma: no cover - pred_socres = self.clf.decision_function(self.X_test) - pred_ranks = self.clf._predict_rank(self.X_test) - - # assert the order is reserved - assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) - assert_array_less(pred_ranks, self.X_train.shape[0] + 1) - assert_array_less(-0.1, pred_ranks) - - def test_predict_rank_normalized(self): # pragma: no cover - pred_socres = self.clf.decision_function(self.X_test) - pred_ranks = self.clf._predict_rank(self.X_test, normalized=True) - - # assert the order is reserved - assert_allclose(rankdata(pred_ranks), rankdata(pred_socres), atol=2) - assert_array_less(pred_ranks, 1.01) - assert_array_less(-0.1, pred_ranks) - - def tearDown(self): - pass diff --git a/tods/detection_algorithm/core/UODCommonTest.py b/tods/detection_algorithm/core/UODCommonTest.py deleted file mode 100755 index 4e9f7c48..00000000 --- a/tods/detection_algorithm/core/UODCommonTest.py +++ /dev/null @@ -1,153 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import division -from __future__ import print_function - -import os -import sys - -import numpy as np -import unittest -# noinspection PyProtectedMember -from numpy.testing import 
assert_equal -from numpy.testing import assert_allclose -from numpy.testing import assert_array_less -from numpy.testing import assert_raises - -from unittest import TestCase - -from sklearn.utils.estimator_checks import check_estimator - -from sklearn.metrics import roc_auc_score -from scipy.stats import rankdata - -# temporary solution for relative imports in case pyod is not installed -# if pyod is installed, no need to use the following line -sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) - -from pyod.utils.data import generate_data - -_dummy = TestCase('__init__') -assert_greater = _dummy.assertGreater -assert_greater_equal = _dummy.assertGreaterEqual -assert_less = _dummy.assertLess -assert_less_equal = _dummy.assertLessEqual - - -class UODCommonTest: - def __init__(self, - model, - X_train, - y_train, - X_test, - y_test, - roc_floor, - ): - self.clf = model - self.X_train = X_train - self.y_train = y_train - self.X_test = X_test - self.y_test = y_test - self.roc_floor = roc_floor - - self.clf.fit(self.X_train) - - pass - - def test_detector(self): - - self.test_parameters() - self.test_train_scores() - self.test_prediction_scores() - self.test_prediction_proba() - self.test_prediction_proba_linear() - self.test_prediction_proba_unify() - self.test_prediction_proba_parameter() - # self.test_fit_predict() - # self.test_fit_predict_score() - self.test_prediction_labels() - # self.test_predict_rank() - # self.test_predict_rank_normalized() - self.tearDown() - - def test_parameters(self): - assert (hasattr(self.clf, 'decision_scores_') and - self.clf.decision_scores_ is not None) - assert (hasattr(self.clf, 'labels_') and - self.clf.labels_ is not None) - assert (hasattr(self.clf, 'threshold_') and - self.clf.threshold_ is not None) - assert (hasattr(self.clf, '_mu') and - self.clf._mu is not None) - assert (hasattr(self.clf, '_sigma') and - self.clf._sigma is not None) - - def test_train_scores(self): - assert_equal(len(self.clf.decision_scores_), self.y_train.shape[0]) - - def test_prediction_scores(self): - pred_scores = self.clf.decision_function(self.X_test) - - # check score shapes - assert_equal(pred_scores.shape[0], self.y_test.shape[0]) - - # check performance - assert_greater_equal(roc_auc_score(self.y_test, pred_scores), self.roc_floor) - - def test_prediction_labels(self): - pred_labels = self.clf.predict(self.X_test) - assert_equal(pred_labels.shape, self.y_test.shape) - - def test_prediction_proba(self): - pred_proba = self.clf.predict_proba(self.X_test) - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_linear(self): - pred_proba = self.clf.predict_proba(self.X_test, method='linear') - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_unify(self): - pred_proba = self.clf.predict_proba(self.X_test, method='unify') - assert_greater_equal(pred_proba.min(), 0) - assert_less_equal(pred_proba.max(), 1) - - def test_prediction_proba_parameter(self): - with assert_raises(ValueError): - self.clf.predict_proba(self.X_test, method='something') - - def test_fit_predict(self): # pragma: no cover - pred_labels = self.clf.fit_predict(X=self.X_train) - assert_equal(pred_labels.shape, self.y_train.shape) - - def test_fit_predict_score(self): # pragma: no cover - self.clf.fit_predict_score(self.X_test, self.y_test) - self.clf.fit_predict_score(self.X_test, self.y_test, - scoring='roc_auc_score') - 
self.clf.fit_predict_score(self.X_test, self.y_test,
-                                   scoring='prc_n_score')
-        with assert_raises(NotImplementedError):
-            self.clf.fit_predict_score(self.X_test, self.y_test,
-                                       scoring='something')
-
-    def test_predict_rank(self): # pragma: no cover
-        pred_scores = self.clf.decision_function(self.X_test)
-        pred_ranks = self.clf._predict_rank(self.X_test)
-
-        # assert the order is preserved
-        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
-        assert_array_less(pred_ranks, self.X_train.shape[0] + 1)
-        assert_array_less(-0.1, pred_ranks)
-
-    def test_predict_rank_normalized(self): # pragma: no cover
-        pred_scores = self.clf.decision_function(self.X_test)
-        pred_ranks = self.clf._predict_rank(self.X_test, normalized=True)
-
-        # assert the order is preserved
-        assert_allclose(rankdata(pred_ranks), rankdata(pred_scores), atol=2)
-        assert_array_less(pred_ranks, 1.01)
-        assert_array_less(-0.1, pred_ranks)
-
-    def tearDown(self):
-        pass
diff --git a/tods/detection_algorithm/core/__init__.py b/tods/detection_algorithm/core/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tods/detection_algorithm/core/algorithm_implementation.py b/tods/detection_algorithm/core/algorithm_implementation.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/tods/detection_algorithm/core/dagmm/__init__.py b/tods/detection_algorithm/core/dagmm/__init__.py
deleted file mode 100644
index cc055001..00000000
--- a/tods/detection_algorithm/core/dagmm/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-# -*- coding: utf-8 -*-
-
-from .compression_net import CompressionNet
-from .estimation_net import EstimationNet
-from .gmm import GMM
-from .dagmm import DAGMM
diff --git a/tods/detection_algorithm/core/dagmm/compression_net.py b/tods/detection_algorithm/core/dagmm/compression_net.py
deleted file mode 100644
index 759dbdc0..00000000
--- a/tods/detection_algorithm/core/dagmm/compression_net.py
+++ /dev/null
@@ -1,121 +0,0 @@
-import tensorflow as tf
-import tensorflow.compat.v1 as tf
-tf.disable_v2_behavior()
-
-class CompressionNet:
-    """ Compression Network.
-    This network converts the input data to representations
-    suitable for the calculation of anomaly scores by the "Estimation Network".
-
-    Outputs of the network consist of the following two components:
-    1) reduced low-dimensional representations learned by an AutoEncoder.
-    2) features derived from the reconstruction error.
-    """
-    def __init__(self, hidden_layer_sizes, activation=tf.nn.tanh):
-        """
-        Parameters
-        ----------
-        hidden_layer_sizes : list of int
-            list of the sizes of the hidden layers.
-            For example, if the sizes are [n1, n2],
-            the sizes of the created networks are:
-            input_size -> n1 -> n2 -> n1 -> input_size
-            (the network outputs the representation of the "n2" layer)
-        activation : function
-            activation function of the hidden layers.
-            The last layer uses a linear function.
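-
-        Example (illustrative):
-            With hidden_layer_sizes=[16, 8, 1] and d input features, the
-            autoencoder is d -> 16 -> 8 -> 1 -> 8 -> 16 -> d, and
-            inference() returns 1 + 2 = 3 features per sample: the
-            compressed representation plus the two reconstruction-error
-            terms computed in loss().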
- """ - self.hidden_layer_sizes = hidden_layer_sizes - self.activation = activation - - def compress(self, x): - self.input_size = x.shape[1] - - with tf.variable_scope("Encoder"): - z = x - n_layer = 0 - for size in self.hidden_layer_sizes[:-1]: - n_layer += 1 - z = tf.layers.dense(z, size, activation=self.activation, - name="layer_{}".format(n_layer)) - - # activation function of last layer is linear - n_layer += 1 - z = tf.layers.dense(z, self.hidden_layer_sizes[-1], - name="layer_{}".format(n_layer)) - - return z - - def reverse(self, z): - with tf.variable_scope("Decoder"): - n_layer = 0 - for size in self.hidden_layer_sizes[:-1][::-1]: - n_layer += 1 - z = tf.layers.dense(z, size, activation=self.activation, - name="layer_{}".format(n_layer)) - - # activation function of last layes is linear - n_layer += 1 - x_dash = tf.layers.dense(z, self.input_size, - name="layer_{}".format(n_layer)) - - return x_dash - - def loss(self, x, x_dash): - def euclid_norm(x): - return tf.sqrt(tf.reduce_sum(tf.square(x), axis=1)) - - # Calculate Euclid norm, distance - norm_x = euclid_norm(x) - norm_x_dash = euclid_norm(x_dash) - dist_x = euclid_norm(x - x_dash) - dot_x = tf.reduce_sum(x * x_dash, axis=1) - - # Based on the original paper, features of reconstraction error - # are composed of these loss functions: - # 1. loss_E : relative Euclidean distance - # 2. loss_C : cosine similarity - min_val = 1e-3 - loss_E = dist_x / (norm_x + min_val) - loss_C = 0.5 * (1.0 - dot_x / (norm_x * norm_x_dash + min_val)) - return tf.concat([loss_E[:,None], loss_C[:,None]], axis=1) - - def extract_feature(self, x, x_dash, z_c): - z_r = self.loss(x, x_dash) - return tf.concat([z_c, z_r], axis=1) - - def inference(self, x): - """ convert input to output tensor, which is composed of - low-dimensional representation and reconstruction error. - - Parameters - ---------- - x : tf.Tensor shape : (n_samples, n_features) - Input data - - Results - ------- - z : tf.Tensor shape : (n_samples, n2 + 2) - Result data - Second dimension of this data is equal to - sum of compressed representation size and - number of loss function (=2) - - x_dash : tf.Tensor shape : (n_samples, n_features) - Reconstructed data for calculation of - reconstruction error. - """ - - with tf.variable_scope("CompNet"): - # AutoEncoder - z_c = self.compress(x) - x_dash = self.reverse(z_c) - - # compose feature vector - z = self.extract_feature(x, x_dash, z_c) - - return z, x_dash - - def reconstruction_error(self, x, x_dash): - return tf.reduce_mean(tf.reduce_sum( - tf.square(x - x_dash), axis=1), axis=0) diff --git a/tods/detection_algorithm/core/dagmm/dagmm.py b/tods/detection_algorithm/core/dagmm/dagmm.py deleted file mode 100644 index 02f882f1..00000000 --- a/tods/detection_algorithm/core/dagmm/dagmm.py +++ /dev/null @@ -1,251 +0,0 @@ -import tensorflow as tf -import numpy as np -from sklearn.preprocessing import StandardScaler -import joblib - -from .compression_net import CompressionNet -from .estimation_net import EstimationNet -from .gmm import GMM -from pyod.utils.stat_models import pairwise_distances_no_broadcast - -from os import makedirs -from os.path import exists, join -import tensorflow.compat.v1 as tf -tf.disable_v2_behavior() - -from pyod.models.base import BaseDetector - -class DAGMM(BaseDetector): - """ Deep Autoencoding Gaussian Mixture Model. 
- - This implementation is based on the paper: - Bo Zong+ (2018) Deep Autoencoding Gaussian Mixture Model - for Unsupervised Anomaly Detection, ICLR 2018 - (this is UNOFFICIAL implementation) - """ - - MODEL_FILENAME = "DAGMM_model" - SCALER_FILENAME = "DAGMM_scaler" - - def __init__(self, comp_hiddens:list = [16,8,1], - est_hiddens:list = [8,4], est_dropout_ratio:float =0.5, - minibatch_size:int = 1024, epoch_size:int =100, - learning_rate:float =0.0001, lambda1:float =0.1, lambda2:float =0.0001, - normalize:bool=True, random_seed:int=123 , contamination:float = 0.001 ): - """ - Parameters - ---------- - comp_hiddens : list of int - sizes of hidden layers of compression network - For example, if the sizes are [n1, n2], - structure of compression network is: - input_size -> n1 -> n2 -> n1 -> input_sizes - - est_hiddens : list of int - sizes of hidden layers of estimation network. - The last element of this list is assigned as n_comp. - For example, if the sizes are [n1, n2], - structure of estimation network is: - input_size -> n1 -> n2 (= n_comp) - - est_dropout_ratio : float (optional) - dropout ratio of estimation network applied during training - if 0 or None, dropout is not applied. - minibatch_size: int (optional) - mini batch size during training - epoch_size : int (optional) - epoch size during training - learning_rate : float (optional) - learning rate during training - lambda1 : float (optional) - a parameter of loss function (for energy term) - lambda2 : float (optional) - a parameter of loss function - (for sum of diagonal elements of covariance) - normalize : bool (optional) - specify whether input data need to be normalized. - by default, input data is normalized. - random_seed : int (optional) - random seed used when fit() is called. - """ - est_activation = tf.nn.tanh - comp_activation = tf.nn.tanh - super(DAGMM, self).__init__(contamination=contamination) - self.comp_net = CompressionNet(comp_hiddens, comp_activation) - self.est_net = EstimationNet(est_hiddens, est_activation) - self.est_dropout_ratio = est_dropout_ratio - - n_comp = est_hiddens[-1] - self.gmm = GMM(n_comp) - - self.minibatch_size = minibatch_size - self.epoch_size = epoch_size - self.learning_rate = learning_rate - self.lambda1 = lambda1 - self.lambda2 = lambda2 - - self.normalize = normalize - self.scaler = None - self.seed = random_seed - - self.graph = None - self.sess = None - - #def __del__(self): - # if self.sess is not None: - # self.sess.close() - - def fit(self,X,y=None): - """ Fit the DAGMM model according to the given data. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Training data. 
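-
-        Example (an illustrative sketch; shapes and hyperparameters are
-        arbitrary):
-            >>> X = np.random.randn(500, 8).astype(np.float32)
-            >>> clf = DAGMM(comp_hiddens=[16, 8, 1], est_hiddens=[8, 4])
-            >>> clf.fit(X)
-            >>> scores = clf.decision_scores_  # sample energies on X
-            >>> labels = clf.labels_           # 0/1 via the contamination threshold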
- """ - - n_samples, n_features = X.shape - - if self.normalize: - self.scaler = scaler = StandardScaler() - X = scaler.fit_transform(X) - - with tf.Graph().as_default() as graph: - self.graph = graph - tf.set_random_seed(self.seed) - np.random.seed(seed=self.seed) - - # Create Placeholder - self.input = input = tf.placeholder( - dtype=tf.float32, shape=[None, n_features]) - self.drop = drop = tf.placeholder(dtype=tf.float32, shape=[]) - - # Build graph - z, x_dash = self.comp_net.inference(input) - gamma = self.est_net.inference(z, drop) - self.gmm.fit(z, gamma) - energy = self.gmm.energy(z) - - self.x_dash = x_dash - - # Loss function - loss = (self.comp_net.reconstruction_error(input, x_dash) + - self.lambda1 * tf.reduce_mean(energy) + - self.lambda2 * self.gmm.cov_diag_loss()) - - # Minimizer - minimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(loss) - - # Number of batch - n_batch = (n_samples - 1) // self.minibatch_size + 1 - - # Create tensorflow session and initilize - init = tf.global_variables_initializer() - - self.sess = tf.Session(graph=graph) - self.sess.run(init) - - # Training - idx = np.arange(X.shape[0]) - np.random.shuffle(idx) - - for epoch in range(self.epoch_size): - for batch in range(n_batch): - i_start = batch * self.minibatch_size - i_end = (batch + 1) * self.minibatch_size - x_batch = X[idx[i_start:i_end]] - - self.sess.run(minimizer, feed_dict={ - input:x_batch, drop:self.est_dropout_ratio}) - if (epoch + 1) % 10 == 0: - loss_val = self.sess.run(loss, feed_dict={input:X, drop:0}) - print(" epoch {}/{} : loss = {:.3f}".format(epoch + 1, self.epoch_size, loss_val)) - - # Fix GMM parameter - fix = self.gmm.fix_op() - self.sess.run(fix, feed_dict={input:X, drop:0}) - self.energy = self.gmm.energy(z) - - tf.add_to_collection("save", self.input) - tf.add_to_collection("save", self.energy) - - self.saver = tf.train.Saver() - - pred_scores = self.decision_function(X) - self.decision_scores_ = pred_scores - self._process_decision_scores() - #return self - - def decision_function(self, X): - """ Calculate anormaly scores (sample energy) on samples in X. - - Parameters - ---------- - X : array-like, shape (n_samples, n_features) - Data for which anomaly scores are calculated. - n_features must be equal to n_features of the fitted data. - - Returns - ------- - energies : array-like, shape (n_samples) - Calculated sample energies. - """ - if self.sess is None: - raise Exception("Trained model does not exist.") - - if self.normalize: - X = self.scaler.transform(X) - - energies = self.sess.run(self.energy, feed_dict={self.input:X}) - - return energies.reshape(1,-1) - - def save(self, fdir): - """ Save trained model to designated directory. - This method have to be called after training. - (If not, throw an exception) - - Parameters - ---------- - fdir : str - Path of directory trained model is saved. - If not exists, it is created automatically. - """ - if self.sess is None: - raise Exception("Trained model does not exist.") - - if not exists(fdir): - makedirs(fdir) - - model_path = join(fdir, self.MODEL_FILENAME) - self.saver.save(self.sess, model_path) - - if self.normalize: - scaler_path = join(fdir, self.SCALER_FILENAME) - joblib.dump(self.scaler, scaler_path) - - def restore(self, fdir): - """ Restore trained model from designated directory. - - Parameters - ---------- - fdir : str - Path of directory trained model is saved. 
- """ - if not exists(fdir): - raise Exception("Model directory does not exist.") - - model_path = join(fdir, self.MODEL_FILENAME) - meta_path = model_path + ".meta" - - with tf.Graph().as_default() as graph: - self.graph = graph - self.sess = tf.Session(graph=graph) - self.saver = tf.train.import_meta_graph(meta_path) - self.saver.restore(self.sess, model_path) - - self.input, self.energy = tf.get_collection("save") - - if self.normalize: - scaler_path = join(fdir, self.SCALER_FILENAME) - self.scaler = joblib.load(scaler_path) diff --git a/tods/detection_algorithm/core/dagmm/estimation_net.py b/tods/detection_algorithm/core/dagmm/estimation_net.py deleted file mode 100644 index 6dbe7cc6..00000000 --- a/tods/detection_algorithm/core/dagmm/estimation_net.py +++ /dev/null @@ -1,63 +0,0 @@ -# -*- coding: utf-8 -*- -import tensorflow as tf -import tensorflow.compat.v1 as tf -tf.disable_v2_behavior() - -class EstimationNet: - """ Estimation Network - - This network converts input feature vector to softmax probability. - Bacause loss function for this network is not defined, - it should be implemented outside of this class. - """ - def __init__(self, hidden_layer_sizes, activation=tf.nn.relu): - """ - Parameters - ---------- - hidden_layer_sizes : list of int - list of sizes of hidden layers. - For example, if the sizes are [n1, n2], - layer sizes of the network are: - input_size -> n1 -> n2 - (network outputs the softmax probabilities of "n2" layer) - activation : function - activation function of hidden layer. - the funtcion of last layer is softmax function. - """ - self.hidden_layer_sizes = hidden_layer_sizes - self.activation = activation - - def inference(self, z, dropout_ratio=None): - """ Output softmax probabilities - - Parameters - ---------- - z : tf.Tensor shape : (n_samples, n_features) - Data inferenced by this network - dropout_ratio : tf.Tensor shape : 0-dimension float (optional) - Specify dropout ratio - (if None, dropout is not applied) - - Results - ------- - probs : tf.Tensor shape : (n_samples, n_classes) - Calculated probabilities - """ - with tf.variable_scope("EstNet"): - n_layer = 0 - for size in self.hidden_layer_sizes[:-1]: - n_layer += 1 - z = tf.layers.dense(z, size, activation=self.activation, - name="layer_{}".format(n_layer)) - if dropout_ratio is not None: - z = tf.layers.dropout(z, dropout_ratio, - name="drop_{}".format(n_layer)) - - # Last layer uses linear function (=logits) - size = self.hidden_layer_sizes[-1] - logits = tf.layers.dense(z, size, activation=None, name="logits") - - # Softmax output - output = tf.nn.softmax(logits) - - return output diff --git a/tods/detection_algorithm/core/dagmm/gmm.py b/tods/detection_algorithm/core/dagmm/gmm.py deleted file mode 100644 index 6da40700..00000000 --- a/tods/detection_algorithm/core/dagmm/gmm.py +++ /dev/null @@ -1,130 +0,0 @@ -# -*- coding: utf-8 -*- -import numpy as np -import tensorflow as tf -import tensorflow.compat.v1 as tf -tf.disable_v2_behavior() -class GMM: - """ Gaussian Mixture Model (GMM) """ - def __init__(self, n_comp): - self.n_comp = n_comp - self.phi = self.mu = self.sigma = None - self.training = False - - def create_variables(self, n_features): - with tf.variable_scope("GMM"): - phi = tf.Variable(tf.zeros(shape=[self.n_comp]), - dtype=tf.float32, name="phi") - mu = tf.Variable(tf.zeros(shape=[self.n_comp, n_features]), - dtype=tf.float32, name="mu") - sigma = tf.Variable(tf.zeros( - shape=[self.n_comp, n_features, n_features]), - dtype=tf.float32, name="sigma") - L = tf.Variable(tf.zeros( - 
shape=[self.n_comp, n_features, n_features]), - dtype=tf.float32, name="L") - - return phi, mu, sigma, L - - def fit(self, z, gamma): - """ fit data to GMM model - - Parameters - ---------- - z : tf.Tensor, shape (n_samples, n_features) - data fitted to GMM. - gamma : tf.Tensor, shape (n_samples, n_comp) - probability. each row is correspond to row of z. - """ - - with tf.variable_scope("GMM"): - # Calculate mu, sigma - # i : index of samples - # k : index of components - # l,m : index of features - gamma_sum = tf.reduce_sum(gamma, axis=0) - self.phi = phi = tf.reduce_mean(gamma, axis=0) - self.mu = mu = tf.einsum('ik,il->kl', gamma, z) / gamma_sum[:,None] - z_centered = tf.sqrt(gamma[:,:,None]) * (z[:,None,:] - mu[None,:,:]) - self.sigma = sigma = tf.einsum( - 'ikl,ikm->klm', z_centered, z_centered) / gamma_sum[:,None,None] - - # Calculate a cholesky decomposition of covariance in advance - n_features = z.shape[1] - min_vals = tf.diag(tf.ones(n_features, dtype=tf.float32)) * 1e-6 - self.L = tf.cholesky(sigma + min_vals[None,:,:]) - - self.training = False - return self - - def fix_op(self): - """ return operator to fix paramters of GMM - Using this operator outside of this class, - you can fix current parameter to static tensor variable. - - After you call this method, you have to run result - operator immediatelly, and call energy() to use static - variables of model parameter. - - Returns - ------- - op : operator of tensorflow - operator to assign current parameter to variables - """ - - phi, mu, sigma, L = self.create_variables(self.mu.shape[1]) - - op = tf.group( - tf.assign(phi, self.phi), - tf.assign(mu, self.mu), - tf.assign(sigma, self.sigma), - tf.assign(L, self.L) - ) - - self.phi, self.phi_org = phi, self.phi - self.mu, self.mu_org = mu, self.mu - self.sigma, self.sigma_org = sigma, self.sigma - self.L, self.L_org = L, self.L - - self.training = False - - return op - - def energy(self, z): - """ calculate an energy of each row of z - - Parameters - ---------- - z : tf.Tensor, shape (n_samples, n_features) - data each row of which is calculated its energy. - - Returns - ------- - energy : tf.Tensor, shape (n_samples) - calculated energies - """ - - if self.training and self.phi is None: - self.phi, self.mu, self.sigma, self.L = self.create_variable(z.shape[1]) - - with tf.variable_scope("GMM_energy"): - # Instead of inverse covariance matrix, exploit cholesky decomposition - # for stability of calculation. 
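-            # For reference, the same computation in numpy terms
-            # (an illustrative sketch, per component k):
-            #     v_k       = solve(L_k, (z - mu_k).T)
-            #     log_det_k = 2 * sum(log(diag(L_k)))
-            #     logit_k   = log(phi_k) - 0.5 * (sum(v_k ** 2, axis=0)
-            #                 + d * log(2 * pi) + log_det_k)
-            #     energy    = -logsumexp(logits over components k)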
-            z_centered = z[:,None,:] - self.mu[None,:,:]  #ikl
-            v = tf.matrix_triangular_solve(self.L, tf.transpose(z_centered, [1, 2, 0]))  # kli
-
-            # log(det(Sigma)) = 2 * sum[log(diag(L))]
-            log_det_sigma = 2.0 * tf.reduce_sum(tf.log(tf.matrix_diag_part(self.L)), axis=1)
-
-            # To calculate energies, use "log-sum-exp" (different from the original paper)
-            d = z.get_shape().as_list()[1]
-            logits = tf.log(self.phi[:,None]) - 0.5 * (tf.reduce_sum(tf.square(v), axis=1)
-                + d * tf.log(2.0 * np.pi) + log_det_sigma[:,None])
-            energies = - tf.reduce_logsumexp(logits, axis=0)
-
-            return energies
-
-    def cov_diag_loss(self):
-        with tf.variable_scope("GMM_diag_loss"):
-            diag_loss = tf.reduce_sum(tf.divide(1, tf.matrix_diag_part(self.sigma)))
-
-        return diag_loss
diff --git a/tods/detection_algorithm/core/test_CollectiveBase.py b/tods/detection_algorithm/core/test_CollectiveBase.py
deleted file mode 100644
index 92e6ff92..00000000
--- a/tods/detection_algorithm/core/test_CollectiveBase.py
+++ /dev/null
@@ -1,211 +0,0 @@
-# -*- coding: utf-8 -*-
-from __future__ import division # pragma: no cover
-from __future__ import print_function # pragma: no cover
-
-import os # pragma: no cover
-import sys # pragma: no cover
-
-import unittest # pragma: no cover
-from sklearn.utils.testing import assert_equal # pragma: no cover
-from sklearn.utils.testing import assert_raises # pragma: no cover
-
-import numpy as np # pragma: no cover
-
-# temporary solution for relative imports in case pyod is not installed
-# if pyod is installed, no need to use the following line
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) # pragma: no cover
-
-from detection_algorithm.core.CollectiveBase import CollectiveBaseDetector # pragma: no cover
-from pyod.utils.data import generate_data # pragma: no cover
-
-
-# Check sklearn\tests\test_base
-# A few test classes
-# noinspection PyMissingConstructor,PyPep8Naming
-class MyEstimator(CollectiveBaseDetector): # pragma: no cover
-
-    def __init__(self, l1=0, empty=None): # pragma: no cover
-        self.l1 = l1
-        self.empty = empty
-
-    def fit(self, X, y=None): # pragma: no cover
-        pass
-
-    def decision_function(self, X): # pragma: no cover
-        pass
-
-
-# noinspection PyMissingConstructor
-class K(CollectiveBaseDetector): # pragma: no cover
-    def __init__(self, c=None, d=None): # pragma: no cover
-        self.c = c
-        self.d = d
-
-    def fit(self, X, y=None): # pragma: no cover
-        pass
-
-    def decision_function(self, X): # pragma: no cover
-        pass
-
-
-# noinspection PyMissingConstructor
-class T(CollectiveBaseDetector): # pragma: no cover
-    def __init__(self, a=None, b=None): # pragma: no cover
-        self.a = a
-        self.b = b
-
-    def fit(self, X, y=None): # pragma: no cover
-        pass
-
-    def decision_function(self, X): # pragma: no cover
-        pass
-
-
-# noinspection PyMissingConstructor
-class ModifyInitParams(CollectiveBaseDetector): # pragma: no cover
-    """Deprecated behavior.
-    Equal parameters but with a type cast.
- Doesn't fulfill a is a - """ - - def __init__(self, a=np.array([0])): # pragma: no cover - self.a = a.copy() - - def fit(self, X, y=None): # pragma: no cover - pass - - def decision_function(self, X): # pragma: no cover - pass - - -# noinspection PyMissingConstructor -class VargEstimator(CollectiveBaseDetector): # pragma: no cover - """scikit-learn estimators shouldn't have vargs.""" - - def __init__(self, *vargs): # pragma: no cover - pass - - def fit(self, X, y=None): # pragma: no cover - pass - - def decision_function(self, X): # pragma: no cover - pass - - -class Dummy1(CollectiveBaseDetector): # pragma: no cover - def __init__(self, contamination=0.1): # pragma: no cover - super(Dummy1, self).__init__(contamination=contamination) - - def decision_function(self, X): # pragma: no cover - pass - - def fit(self, X, y=None): # pragma: no cover - pass - - -class Dummy2(CollectiveBaseDetector): # pragma: no cover - def __init__(self, contamination=0.1): # pragma: no cover - super(Dummy2, self).__init__(contamination=contamination) - - def decision_function(self, X): # pragma: no cover - pass - - def fit(self, X, y=None): # pragma: no cover - return X - - -class Dummy3(CollectiveBaseDetector): # pragma: no cover - def __init__(self, contamination=0.1): # pragma: no cover - super(Dummy3, self).__init__(contamination=contamination) - - def decision_function(self, X): # pragma: no cover - pass - - def fit(self, X, y=None): # pragma: no cover - self.labels_ = X - - -class TestBASE(unittest.TestCase): # pragma: no cover - def setUp(self): # pragma: no cover - self.n_train = 100 - self.n_test = 50 - self.contamination = 0.1 - self.roc_floor = 0.6 - self.X_train, self.y_train, self.X_test, self.y_test = generate_data( - n_train=self.n_train, n_test=self.n_test, - contamination=self.contamination) - - def test_init(self): # pragma: no cover - """ - Test base class initialization - - :return: - """ - self.dummy_clf = Dummy1() - assert_equal(self.dummy_clf.contamination, 0.1) - - self.dummy_clf = Dummy1(contamination=0.2) - assert_equal(self.dummy_clf.contamination, 0.2) - - with assert_raises(ValueError): - Dummy1(contamination=0.51) - - with assert_raises(ValueError): - Dummy1(contamination=0) - - with assert_raises(ValueError): - Dummy1(contamination=-0.5) - - def test_fit(self): # pragma: no cover - self.dummy_clf = Dummy2() - assert_equal(self.dummy_clf.fit(0), 0) - - def test_fit_predict(self): # pragma: no cover - # TODO: add more testcases - - self.dummy_clf = Dummy3() - - assert_equal(self.dummy_clf.fit_predict(0), 0) - - def test_predict_proba(self): # pragma: no cover - # TODO: create uniform testcases - pass - - def test_rank(self): # pragma: no cover - # TODO: create uniform testcases - pass - - def test_repr(self): # pragma: no cover - # Smoke test the repr of the base estimator. 
- my_estimator = MyEstimator() - repr(my_estimator) - test = T(K(), K()) - assert_equal( - repr(test), - "T(a=K(c=None, d=None), b=K(c=None, d=None))" - ) - - some_est = T(a=["long_params"] * 1000) - assert_equal(len(repr(some_est)), 415) - - def test_str(self): # pragma: no cover - # Smoke test the str of the base estimator - my_estimator = MyEstimator() - str(my_estimator) - - def test_get_params(self): # pragma: no cover - test = T(K(), K()) - - assert ('a__d' in test.get_params(deep=True)) - assert ('a__d' not in test.get_params(deep=False)) - - test.set_params(a__d=2) - assert (test.a.d == 2) - assert_raises(ValueError, test.set_params, a__a=2) - - def tearDown(self): # pragma: no cover - pass - - -if __name__ == '__main__': # pragma: no cover - unittest.main() diff --git a/tods/detection_algorithm/core/utility.py b/tods/detection_algorithm/core/utility.py deleted file mode 100644 index a486b995..00000000 --- a/tods/detection_algorithm/core/utility.py +++ /dev/null @@ -1,179 +0,0 @@ -# -*- coding: utf-8 -*- -"""Utility functions for supporting time-series based outlier detection. -""" - -import numpy as np -from sklearn.utils import check_array - - -# def get_sub_sequences(X, window_size, step=1): -# """Chop a univariate time series into sub sequences. - -# Parameters -# ---------- -# X : numpy array of shape (n_samples,) -# The input samples. - -# window_size : int -# The moving window size. - -# step_size : int, optional (default=1) -# The displacement for moving window. - -# Returns -# ------- -# X_sub : numpy array of shape (valid_len, window_size) -# The numpy matrix with each row stands for a subsequence. -# """ -# X = check_array(X).astype(np.float) -# n_samples = len(X) - -# # get the valid length -# valid_len = get_sub_sequences_length(n_samples, window_size, step) - -# X_sub = np.zeros([valid_len, window_size]) -# # y_sub = np.zeros([valid_len, 1]) - -# # exclude the edge -# steps = list(range(0, n_samples, step)) -# steps = steps[:valid_len] - -# for idx, i in enumerate(steps): -# X_sub[idx,] = X[i: i + window_size].ravel() - -# return X_sub - -def get_sub_matrices(X, window_size, step=1, return_numpy=True, flatten=True, - flatten_order='F'): - """Chop a multivariate time series into sub sequences (matrices). - - Parameters - ---------- - X : numpy array of shape (n_samples,) - The input samples. - - window_size : int - The moving window size. - - step_size : int, optional (default=1) - The displacement for moving window. - - return_numpy : bool, optional (default=True) - If True, return the data format in 3d numpy array. - - flatten : bool, optional (default=True) - If True, flatten the returned array in 2d. - - flatten_order : str, optional (default='F') - Decide the order of the flatten for multivarite sequences. - ‘C’ means to flatten in row-major (C-style) order. - ‘F’ means to flatten in column-major (Fortran- style) order. - ‘A’ means to flatten in column-major order if a is Fortran contiguous in memory, - row-major order otherwise. ‘K’ means to flatten a in the order the elements occur in memory. - The default is ‘F’. - - Returns - ------- - X_sub : numpy array of shape (valid_len, window_size*n_sequences) - The numpy matrix with each row stands for a flattend submatrix. 
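-
-    Examples (illustrative):
-        For X of shape (12, 2) with window_size=3 and step=2,
-        valid_len = floor((12 - 3) / 2) + 1 = 5, so the flattened output
-        X_sub has shape (5, 3 * 2) = (5, 6), with left indices
-        [0, 2, 4, 6, 8] and right indices [3, 5, 7, 9, 11].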
- """ - X = check_array(X).astype(np.float) - n_samples, n_sequences = X.shape[0], X.shape[1] - - # get the valid length - valid_len = get_sub_sequences_length(n_samples, window_size, step) - - X_sub = [] - X_left_inds = [] - X_right_inds = [] - - # exclude the edge - steps = list(range(0, n_samples, step)) - steps = steps[:valid_len] - - # print(n_samples, n_sequences) - for idx, i in enumerate(steps): - X_sub.append(X[i: i + window_size, :]) - X_left_inds.append(i) - X_right_inds.append(i + window_size) - - X_sub = np.asarray(X_sub) - - if return_numpy: - if flatten: - temp_array = np.zeros([valid_len, window_size * n_sequences]) - if flatten_order == 'C': - for i in range(valid_len): - temp_array[i, :] = X_sub[i, :, :].flatten(order='C') - - else: - for i in range(valid_len): - temp_array[i, :] = X_sub[i, :, :].flatten(order='F') - return temp_array, np.asarray(X_left_inds), np.asarray( - X_right_inds) - - else: - return np.asarray(X_sub), np.asarray(X_left_inds), np.asarray( - X_right_inds) - else: - return X_sub, np.asarray(X_left_inds), np.asarray(X_right_inds) - - -def get_sub_sequences_length(n_samples, window_size, step): - """Pseudo chop a univariate time series into sub sequences. Return valid - length only. - - Parameters - ---------- - X : numpy array of shape (n_samples,) - The input samples. - - window_size : int - The moving window size. - - step_size : int, optional (default=1) - The displacement for moving window. - - Returns - ------- - valid_len : int - The number of subsequences. - - """ - # if X.shape[0] == 1: - # n_samples = X.shape[1] - # elif X.shape[1] == 1: - # n_samples = X.shape[0] - # else: - # raise ValueError("X is not a univarite series. The shape is {shape}.".format(shape=X.shape)) - - # valid_len = n_samples - window_size + 1 - # valida_len = int_down(n_samples-window_size)/step + 1 - valid_len = int(np.floor((n_samples - window_size) / step)) + 1 - return valid_len - - -if __name__ == "__main__": - X_train = np.asarray( - [3., 4., 8., 16, 18, 13., 22., 36., 59., 128, 62, 67, 78, - 100]).reshape(-1, 1) - - X_train = np.asarray( - [[3., 5], [5., 9], [7., 2], [42., 20], [8., 12], [10., 12], [12., 12], - [18., 16], [20., 7], [18., 10], [23., 12], [22., 15]]) - - # n_samples = X.shape[0] - - window_size = 3 - - # valid_len = n_samples - window_size + 1 - - # X_sub = np.zeros([valid_len, window_size]) - - # for i in range(valid_len): - # X_sub[i, ] = X[i: i+window_size] - - # X_sub_2 = get_sub_sequences(X, window_size, step=2) - X_sub_3, X_left_inds, X_right_inds = get_sub_matrices(X_train, window_size, - step=2, - flatten_order='C') diff --git a/tods/detection_algorithm/core/utils/__init__.py b/tods/detection_algorithm/core/utils/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tods/detection_algorithm/core/utils/channel.py b/tods/detection_algorithm/core/utils/channel.py deleted file mode 100644 index 197bee44..00000000 --- a/tods/detection_algorithm/core/utils/channel.py +++ /dev/null @@ -1,114 +0,0 @@ -import numpy as np -import os -import logging - -logger = logging.getLogger('telemanom') - - -class Channel: - def __init__(self,n_predictions,l_s): - # , config, chan_id): - """ - Load and reshape channel values (predicted and actual). 
- - Args: - config (obj): Config object containing parameters for processing - chan_id (str): channel id - - Attributes: - id (str): channel id - config (obj): see Args - X_train (arr): training inputs with dimensions - [timesteps, l_s, input dimensions) - X_test (arr): test inputs with dimensions - [timesteps, l_s, input dimensions) - y_train (arr): actual channel training values with dimensions - [timesteps, n_predictions, 1) - y_test (arr): actual channel test values with dimensions - [timesteps, n_predictions, 1) - train (arr): train data loaded from .npy file - test(arr): test data loaded from .npy file - """ - - # self.id = chan_id - # self.config = config - self.X_train = None - self.y_train = None - self.X_test = None - self.y_test = None - self.y_hat = None - self.train = None - self.test = None - - self._n_predictions = n_predictions - self._l_s = l_s - - def shape_train_data(self, arr): - # , train=True): - """Shape raw input streams for ingestion into LSTM. config.l_s specifies - the sequence length of prior timesteps fed into the model at - each timestep t. - - Args: - arr (np array): array of input streams with - dimensions [timesteps, 1, input dimensions] - train (bool): If shaping training data, this indicates - data can be shuffled - """ - # print("in shape data") - # print("arr shape",arr.shape) - # print("ls",self.config.l_s) - # print("n_pred",self.config.n_predictions) - data = [] - - for i in range(len(arr) - self._l_s - self._n_predictions): - data.append(arr[i:i + self._l_s + self._n_predictions]) - data = np.array(data) - # print("data shape",data.shape) - # assert len(data.shape) == 3 - - # if train: - # # np.random.shuffle(data) - # self.X_train = data[:, :-self.config.n_predictions, :] - # self.y_train = data[:, -self.config.n_predictions:, :] # telemetry value is at position 0 - # self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],self.y_train.shape[1]*self.y_train.shape[2])) - # print("X train shape",self.X_train .shape) - # print("Y train shape",self.y_train .shape) - # else: - - self.X_train = data[:, :-self._n_predictions, :] - self.y_train = data[:, -self._n_predictions:, :] # telemetry value is at position 0 - self.y_train = np.reshape(self.y_train,(self.y_train.shape[0],self.y_train.shape[1]*self.y_train.shape[2])) - - - - - def shape_test_data(self, arr): - data = [] - - for i in range(len(arr) - self._l_s - self._n_predictions): - data.append(arr[i:i + self._l_s + self._n_predictions]) - data = np.array(data) - # print("data shape",data.shape) - self.X_test = data[:, :-self._n_predictions, :] - self.y_test = data[:, -self._n_predictions:, :] # telemetry value is at position 0 - self.y_test = np.reshape(self.y_test,(self.y_test.shape[0],self.y_test.shape[1]*self.y_test.shape[2])) - - - # def load_data(self): - # """ - # Load train and test data from local. 
- # """ - # # try: - # # self.train = np.load(os.path.join("data", "train", "{}.npy".format(self.id))) - # # self.test = np.load(os.path.join("data", "test", "{}.npy".format(self.id))) - - # # except FileNotFoundError as e: - # # # logger.critical(e) - # # # logger.critical("Source data not found, may need to add data to repo: ") - # # print("Source data not found, may need to add data to repo: ") - - # print("before shape function") - # print(self.train.shape) - # self.shape_data(self.train) - # self.shape_data(self.test, train=False) \ No newline at end of file diff --git a/tods/detection_algorithm/core/utils/errors.py b/tods/detection_algorithm/core/utils/errors.py deleted file mode 100644 index d3ee8ab3..00000000 --- a/tods/detection_algorithm/core/utils/errors.py +++ /dev/null @@ -1,532 +0,0 @@ -import numpy as np -import pandas as pd -import more_itertools as mit -import os -import logging - -logger = logging.getLogger('telemanom') - - -class Errors: - def __init__(self, channel, window_size,batch_size, smoothing_perc,n_predictions,l_s,error_buffer,p): - """ - Batch processing of errors between actual and predicted values - for a channel. - - Args: - channel (obj): Channel class object containing train/test data - for X,y for a single channel - config (obj): Config object containing parameters for processing - run_id (str): Datetime referencing set of predictions in use - - Attributes: - config (obj): see Args - window_size (int): number of trailing batches to use in error - calculation - n_windows (int): number of windows in test values for channel - i_anom (arr): indices of anomalies in channel test values - E_seq (arr of tuples): array of (start, end) indices for each - continuous anomaly sequence in test values - anom_scores (arr): score indicating relative severity of each - anomaly sequence in E_seq - e (arr): errors in prediction (predicted - actual) - e_s (arr): exponentially-smoothed errors in prediction - normalized (arr): prediction errors as a percentage of the range - of the channel values - """ - - # self.config = config - - - self._window_size =window_size - self._batch_size = batch_size - self._smoothing_perc = smoothing_perc - self._n_predictions = n_predictions - self._l_s = l_s - self._error_buffer = error_buffer - self._p = p - - - self.window_size = self._window_size - self.n_windows = int((channel.y_test.shape[0] - - (self._batch_size * self.window_size)) - / self._batch_size) - self.i_anom = np.array([]) - self.E_seq = [] - self.anom_scores = [] - channel.y_test = np.reshape(channel.y_test,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) - # print("*****************************") - # print("y_hat shape",channel.y_hat.shape) - # print("y_test shape",channel.y_test.shape) - - channel.y_hat = np.reshape(channel.y_hat, (channel.y_hat.shape[0]*channel.y_hat.shape[1]*channel.y_hat.shape[2])) - channel.y_test = np.reshape(channel.y_test, (channel.y_test.shape[0]*channel.y_test.shape[1]*channel.y_test.shape[2])) - - # print("after y_hat shape",channel.y_hat.shape) - # print(" after y_test shape",channel.y_test.shape) - - - # raw prediction error - self.e = [abs(y_h-y_t) for y_h, y_t in - zip(channel.y_hat, channel.y_test)] - - self.e = np.reshape(self.e,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) - # print("raw shape",self.e.shape) - - n_pred = self._n_predictions - n_feature = channel.X_test.shape[2] - - # Aggregation for point wise - # aggregated_error = np.zeros(n_feature*(len(self.e)+n_pred-1)) - # aggregated_error = 
np.reshape(aggregated_error,((len(self.e)+n_pred-1),n_feature)) - # # print(aggregated_error) - # for i in range(0,len(self.e)): - # for j in range(len(self.e[i])): - # aggregated_error[i+j]+= self.e[i][j] - - # for i in range(1, len(aggregated_error)+1): - # if i < n_pred: - # aggregated_error[i-1] /=i - # elif len(aggregated_error) - i+1 < n_pred: - # aggregated_error[i-1]/= len(aggregated_error) - i+1 - # else: - # aggregated_error[i-1] /=n_pred - - # Aggregation sequence wise - aggregated_error = [] - for i in range(0,len(self.e)): - aggregated_error.append(np.sum(self.e[i],axis=0)) - - aggregated_error = np.asarray(aggregated_error) - # print(aggregated_error.shape) - - smoothing_window = int(self._batch_size * self._window_size - * self._smoothing_perc) - if not len(channel.y_hat) == len(channel.y_test): - raise ValueError('len(y_hat) != len(y_test): {}, {}' - .format(len(channel.y_hat), len(channel.y_test))) - - # smoothed prediction error - self.e_s = pd.DataFrame(aggregated_error).ewm(span=smoothing_window)\ - .mean().values.flatten() - - # print("ES",self.e_s) - # print("ES",self.e_s.shape) - # for values at beginning < sequence length, just use avg - # if not channel.id == 'C-2': # anomaly occurs early in window - - # print("LHS",self.e_s[:self.config.l_s]) - # print("RHS",[np.mean(self.e_s[:self.config.l_s * 2])] * self.config.l_s) - # b = [np.mean(self.e_s[:self.config.l_s * 2])] * self.config.l_s - # print("RHS shape",len(b)) - # self.e_s[:self._l_s] = [np.mean(self.e_s[:self._l_s * 2])] * self._l_s - - # np.save(os.path.join('data', run_id, 'smoothed_errors', '{}.npy' - # .format(channel.id)), - # np.array(self.e_s)) - - self.normalized = np.mean(self.e / np.ptp(channel.y_test)) - # logger.info("normalized prediction error: {0:.2f}" - # .format(self.normalized)) - - def adjust_window_size(self, channel): # pragma: no cover - """ - Decrease the historical error window size (h) if number of test - values is limited. - - Args: - channel (obj): Channel class object containing train/test data - for X,y for a single channel - """ - - while self.n_windows < 0: - self.window_size -= 1 - self.n_windows = int((channel.y_test.shape[0] - - (self._batch_size * self.window_size)) - / self._batch_size) - if self.window_size == 1 and self.n_windows < 0: - raise ValueError('Batch_size ({}) larger than y_test (len={}). ' - 'Adjust in config.yaml.' - .format(self._batch_size, - channel.y_test.shape[0])) - - def merge_scores(self): # pragma: no cover - """ - If anomalous sequences from subsequent batches are adjacent they - will automatically be combined. This combines the scores for these - initial adjacent sequences (scores are calculated as each batch is - processed) where applicable. - """ - - merged_scores = [] - score_end_indices = [] - - for i, score in enumerate(self.anom_scores): - if not score['start_idx']-1 in score_end_indices: - merged_scores.append(score['score']) - score_end_indices.append(score['end_idx']) - - def process_batches(self, channel): # pragma: no cover - """ - Top-level function for the Error class that loops through batches - of values for a channel. 
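-
-        (For each window this finds thresholds for the regular and
-        inverted smoothed errors, flags values above them, prunes weak
-        sequences, and scores the survivors; see the methods below.)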
- - Args: - channel (obj): Channel class object containing train/test data - for X,y for a single channel - """ - - self.adjust_window_size(channel) - - for i in range(0, self.n_windows+1): - prior_idx = i * self._batch_size - idx = (self._window_size * self._batch_size) \ - + (i * self._batch_size) - if i == self.n_windows: - idx = channel.y_test.shape[0] - - window = ErrorWindow(channel, prior_idx, idx, self, i,self._l_s,self._error_buffer,self._batch_size,self._p) - - window.find_epsilon() - window.find_epsilon(inverse=True) - - window.compare_to_epsilon(self) - window.compare_to_epsilon(self, inverse=True) - - if len(window.i_anom) == 0 and len(window.i_anom_inv) == 0: - continue - - window.prune_anoms() - window.prune_anoms(inverse=True) - - if len(window.i_anom) == 0 and len(window.i_anom_inv) == 0: - continue - - window.i_anom = np.sort(np.unique( - np.append(window.i_anom, window.i_anom_inv))).astype('int') - window.score_anomalies(prior_idx) - # print("window anom scores", window.anom_scores) - - # update indices to reflect true indices in full set of values - self.i_anom = np.append(self.i_anom, window.i_anom + prior_idx) - self.anom_scores = self.anom_scores + window.anom_scores - - if len(self.i_anom) > 0: - # group anomalous indices into continuous sequences - groups = [list(group) for group in - mit.consecutive_groups(self.i_anom)] - self.E_seq = [(int(g[0]), int(g[-1])) for g in groups - if not g[0] == g[-1]] - - # additional shift is applied to indices so that they represent the - # position in the original data array, obtained from the .npy files, - # and not the position on y_test (See PR #27). - self.E_seq = [(e_seq[0] + self._l_s, - e_seq[1] + self._l_s) for e_seq in self.E_seq] - - self.merge_scores() - - -class ErrorWindow: # pragma: no cover - def __init__(self, channel,start_idx, end_idx, errors, window_num,l_s,error_buffer,batch_size,p): - """ - Data and calculations for a specific window of prediction errors. - Includes finding thresholds, pruning, and scoring anomalous sequences - for errors and inverted errors (flipped around mean) - significant drops - in values can also be anomalous. 
- - Args: - channel (obj): Channel class object containing train/test data - for X,y for a single channel - config (obj): Config object containing parameters for processing - start_idx (int): Starting index for window within full set of - channel test values - end_idx (int): Ending index for window within full set of channel - test values - errors (arr): Errors class object - window_num (int): Current window number within channel test values - - Attributes: - i_anom (arr): indices of anomalies in window - i_anom_inv (arr): indices of anomalies in window of inverted - telemetry values - E_seq (arr of tuples): array of (start, end) indices for each - continuous anomaly sequence in window - E_seq_inv (arr of tuples): array of (start, end) indices for each - continuous anomaly sequence in window of inverted telemetry - values - non_anom_max (float): highest smoothed error value below epsilon - non_anom_max_inv (float): highest smoothed error value below - epsilon_inv - config (obj): see Args - anom_scores (arr): score indicating relative severity of each - anomaly sequence in E_seq within a window - window_num (int): see Args - sd_lim (int): default number of standard deviations to use for - threshold if no winner or too many anomalous ranges when scoring - candidate thresholds - sd_threshold (float): number of standard deviations for calculation - of best anomaly threshold - sd_threshold_inv (float): same as above for inverted channel values - e_s (arr): exponentially-smoothed prediction errors in window - e_s_inv (arr): inverted e_s - sd_e_s (float): standard deviation of e_s - mean_e_s (float): mean of e_s - epsilon (float): threshold for e_s above which an error is - considered anomalous - epsilon_inv (float): threshold for inverted e_s above which an error - is considered anomalous - y_test (arr): Actual telemetry values for window - sd_values (float): st dev of y_test - perc_high (float): the 95th percentile of y_test values - perc_low (float): the 5th percentile of y_test values - inter_range (float): the range between perc_high - perc_low - num_to_ignore (int): number of values to ignore initially when - looking for anomalies - """ - - self._l_s = l_s - self._error_buffer = error_buffer - self._batch_size = batch_size - self._p = p - - - self.i_anom = np.array([]) - self.E_seq = np.array([]) - self.non_anom_max = -1000000 - self.i_anom_inv = np.array([]) - self.E_seq_inv = np.array([]) - self.non_anom_max_inv = -1000000 - - # self.config = config - self.anom_scores = [] - - self.window_num = window_num - - self.sd_lim = 12.0 - self.sd_threshold = self.sd_lim - self.sd_threshold_inv = self.sd_lim - - self.e_s = errors.e_s[start_idx:end_idx] - - self.mean_e_s = np.mean(self.e_s) - self.sd_e_s = np.std(self.e_s) - self.e_s_inv = np.array([self.mean_e_s + (self.mean_e_s - e) - for e in self.e_s]) - - self.epsilon = self.mean_e_s + self.sd_lim * self.sd_e_s - self.epsilon_inv = self.mean_e_s + self.sd_lim * self.sd_e_s - - self.y_test = channel.y_test[start_idx:end_idx] - self.sd_values = np.std(self.y_test) - - self.perc_high, self.perc_low = np.percentile(self.y_test, [95, 5]) - self.inter_range = self.perc_high - self.perc_low - - # ignore initial error values until enough history for processing - self.num_to_ignore = self._l_s * 2 - # if y_test is small, ignore fewer - if len(channel.y_test) < 2500: - self.num_to_ignore = self._l_s - if len(channel.y_test) < 1800: - self.num_to_ignore = 0 - - def find_epsilon(self, inverse=False): - """ - Find the anomaly threshold that maximizes 
function representing - tradeoff between: - a) number of anomalies and anomalous ranges - b) the reduction in mean and st dev if anomalous points are removed - from errors - (see https://arxiv.org/pdf/1802.04431.pdf) - - Args: - inverse (bool): If true, epsilon is calculated for inverted errors - """ - e_s = self.e_s if not inverse else self.e_s_inv - - max_score = -10000000 - - for z in np.arange(2.5, self.sd_lim, 0.5): - epsilon = self.mean_e_s + (self.sd_e_s * z) - - pruned_e_s = e_s[e_s < epsilon] - - i_anom = np.argwhere(e_s >= epsilon).reshape(-1,) - buffer = np.arange(1, self._error_buffer) - i_anom = np.sort(np.concatenate((i_anom, - np.array([i+buffer for i in i_anom]) - .flatten(), - np.array([i-buffer for i in i_anom]) - .flatten()))) - i_anom = i_anom[(i_anom < len(e_s)) & (i_anom >= 0)] - i_anom = np.sort(np.unique(i_anom)) - - if len(i_anom) > 0: - # group anomalous indices into continuous sequences - groups = [list(group) for group - in mit.consecutive_groups(i_anom)] - E_seq = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] - - mean_perc_decrease = (self.mean_e_s - np.mean(pruned_e_s)) \ - / self.mean_e_s - sd_perc_decrease = (self.sd_e_s - np.std(pruned_e_s)) \ - / self.sd_e_s - score = (mean_perc_decrease + sd_perc_decrease) \ - / (len(E_seq) ** 2 + len(i_anom)) - - # sanity checks / guardrails - if score >= max_score and len(E_seq) <= 5 and \ - len(i_anom) < (len(e_s) * 0.5): - max_score = score - if not inverse: - self.sd_threshold = z - self.epsilon = self.mean_e_s + z * self.sd_e_s - else: - self.sd_threshold_inv = z - self.epsilon_inv = self.mean_e_s + z * self.sd_e_s - - def compare_to_epsilon(self, errors_all, inverse=False): - """ - Compare smoothed error values to epsilon (error threshold) and group - consecutive errors together into sequences. - - Args: - errors_all (obj): Errors class object containing list of all - previously identified anomalies in test set - """ - - e_s = self.e_s if not inverse else self.e_s_inv - epsilon = self.epsilon if not inverse else self.epsilon_inv - - # Check: scale of errors compared to values too small? 
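-        # (Reference, condensed from find_epsilon above: each candidate
-        #  threshold eps = mean + z * std with z in arange(2.5, sd_lim, 0.5)
-        #  is scored as
-        #      (pct. drop in mean + pct. drop in std) / (len(E_seq)**2 + len(i_anom)),
-        #  and the best-scoring z with <= 5 sequences and < 50% of points
-        #  flagged is kept.)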
- if not (self.sd_e_s > (.05 * self.sd_values) or max(self.e_s) - > (.05 * self.inter_range)) or not max(self.e_s) > 0.05: - return - - i_anom = np.argwhere((e_s >= epsilon) & - (e_s > 0.05 * self.inter_range)).reshape(-1,) - - if len(i_anom) == 0: - return - buffer = np.arange(1, self._error_buffer+1) - i_anom = np.sort(np.concatenate((i_anom, - np.array([i + buffer for i in i_anom]) - .flatten(), - np.array([i - buffer for i in i_anom]) - .flatten()))) - i_anom = i_anom[(i_anom < len(e_s)) & (i_anom >= 0)] - - # if it is first window, ignore initial errors (need some history) - if self.window_num == 0: - i_anom = i_anom[i_anom >= self.num_to_ignore] - else: - i_anom = i_anom[i_anom >= len(e_s) - self._batch_size] - - i_anom = np.sort(np.unique(i_anom)) - - # capture max of non-anomalous values below the threshold - # (used in filtering process) - batch_position = self.window_num * self._batch_size - window_indices = np.arange(0, len(e_s)) + batch_position - adj_i_anom = i_anom + batch_position - window_indices = np.setdiff1d(window_indices, - np.append(errors_all.i_anom, adj_i_anom)) - candidate_indices = np.unique(window_indices - batch_position) - non_anom_max = np.max(np.take(e_s, candidate_indices)) - - # group anomalous indices into continuous sequences - groups = [list(group) for group in mit.consecutive_groups(i_anom)] - E_seq = [(g[0], g[-1]) for g in groups if not g[0] == g[-1]] - - if inverse: - self.i_anom_inv = i_anom - self.E_seq_inv = E_seq - self.non_anom_max_inv = non_anom_max - else: - self.i_anom = i_anom - self.E_seq = E_seq - self.non_anom_max = non_anom_max - - def prune_anoms(self, inverse=False): - """ - Remove anomalies that don't meet minimum separation from the next - closest anomaly or error value - - Args: - inverse (bool): If true, epsilon is calculated for inverted errors - """ - - E_seq = self.E_seq if not inverse else self.E_seq_inv - e_s = self.e_s if not inverse else self.e_s_inv - non_anom_max = self.non_anom_max if not inverse \ - else self.non_anom_max_inv - - if len(E_seq) == 0: - return - - E_seq_max = np.array([max(e_s[e[0]:e[1]+1]) for e in E_seq]) - E_seq_max_sorted = np.sort(E_seq_max)[::-1] - E_seq_max_sorted = np.append(E_seq_max_sorted, [non_anom_max]) - - i_to_remove = np.array([]) - for i in range(0, len(E_seq_max_sorted)-1): - if (E_seq_max_sorted[i] - E_seq_max_sorted[i+1]) \ - / E_seq_max_sorted[i] < self._p: - i_to_remove = np.append(i_to_remove, np.argwhere( - E_seq_max == E_seq_max_sorted[i])) - else: - i_to_remove = np.array([]) - i_to_remove[::-1].sort() - - if len(i_to_remove) > 0: - E_seq = np.delete(E_seq, i_to_remove, axis=0) - - if len(E_seq) == 0 and inverse: - self.i_anom_inv = np.array([]) - return - elif len(E_seq) == 0 and not inverse: - self.i_anom = np.array([]) - return - - indices_to_keep = np.concatenate([range(e_seq[0], e_seq[-1]+1) - for e_seq in E_seq]) - - if not inverse: - mask = np.isin(self.i_anom, indices_to_keep) - self.i_anom = self.i_anom[mask] - else: - mask_inv = np.isin(self.i_anom_inv, indices_to_keep) - self.i_anom_inv = self.i_anom_inv[mask_inv] - - def score_anomalies(self, prior_idx): - """ - Calculate anomaly scores based on max distance from epsilon - for each anomalous sequence. 
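-
-        (Per sequence, the severity score is
-            max_i |e_s[i] - epsilon| / (mean(e_s) + std(e_s)),
-        evaluated for both the regular and inverted errors; the larger
-        of the two is kept, as implemented below.)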
- - Args: - prior_idx (int): starting index of window within full set of test - values for channel - """ - - groups = [list(group) for group in mit.consecutive_groups(self.i_anom)] - - for e_seq in groups: - - score_dict = { - "start_idx": e_seq[0] + prior_idx, - "end_idx": e_seq[-1] + prior_idx, - "score": 0 - } - - score = max([abs(self.e_s[i] - self.epsilon) - / (self.mean_e_s + self.sd_e_s) for i in - range(e_seq[0], e_seq[-1] + 1)]) - inv_score = max([abs(self.e_s_inv[i] - self.epsilon_inv) - / (self.mean_e_s + self.sd_e_s) for i in - range(e_seq[0], e_seq[-1] + 1)]) - - # the max score indicates whether anomaly was from regular - # or inverted errors - score_dict['score'] = max([score, inv_score]) - self.anom_scores.append(score_dict) \ No newline at end of file diff --git a/tods/detection_algorithm/core/utils/modeling.py b/tods/detection_algorithm/core/utils/modeling.py deleted file mode 100644 index 6fb36510..00000000 --- a/tods/detection_algorithm/core/utils/modeling.py +++ /dev/null @@ -1,206 +0,0 @@ -from tensorflow.keras.models import Sequential, load_model -from tensorflow.keras.callbacks import History, EarlyStopping, Callback -from tensorflow.keras.layers import LSTM -from tensorflow.keras.layers import Dense, Activation, Dropout -from tensorflow.keras.layers import Flatten -import numpy as np -import os -import logging - -# suppress tensorflow CPU speedup warnings -os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' -logger = logging.getLogger('telemanom') - - -class Model: - def __init__(self, channel,patience,min_delta,layers,dropout,n_predictions,loss_metric, - optimizer,lstm_batch_size,epochs,validation_split,batch_size,l_s - ): - """ - Loads/trains RNN and predicts future telemetry values for a channel. - - Args: - config (obj): Config object containing parameters for processing - and model training - run_id (str): Datetime referencing set of predictions in use - channel (obj): Channel class object containing train/test data - for X,y for a single channel - - Attributes: - config (obj): see Args - chan_id (str): channel id - run_id (str): see Args - y_hat (arr): predicted channel values - model (obj): trained RNN model for predicting channel values - """ - - # self.config = config - # self.chan_id = channel.id - # self.run_id = run_id - self.y_hat = np.array([]) - self.model = None - - # self.save() - - self._patience = patience - self._min_delta = min_delta - self._layers = layers - self._dropout = dropout - self._n_predictions = n_predictions - self._loss_metric = loss_metric - self._optimizer = optimizer - self._lstm_batch_size = lstm_batch_size - self._epochs = epochs - self._validation_split = validation_split - self._batch_size = batch_size - self._l_s = l_s - - self.train_new(channel) - - - # def load(self): - # """ - # Load model for channel. - # """ - - # logger.info('Loading pre-trained model') - # self.model = load_model(os.path.join('data', self.config.use_id, - # 'models', self.chan_id + '.h5')) - - def train_new(self, channel): - """ - Train LSTM model according to specifications in config.yaml. 
-
-        Args:
-            channel (obj): Channel class object containing train/test data
-            for X,y for a single channel
-        """
-
-        cbs = [History(), EarlyStopping(monitor='val_loss',
-                                        patience=self._patience,
-                                        min_delta=self._min_delta,
-                                        verbose=1)]
-
-        self.model = Sequential()
-
-        self.model.add(LSTM(
-            self._layers[0],
-            input_shape=(None, channel.X_train.shape[2]),
-            return_sequences=True))
-        self.model.add(Dropout(self._dropout))
-
-        self.model.add(LSTM(
-            self._layers[1],
-            return_sequences=False))
-        self.model.add(Dropout(self._dropout))
-
-        self.model.add(Dense(
-            self._n_predictions
-            *channel.X_train.shape[2]
-            ))
-        self.model.add(Activation('linear'))
-
-        self.model.compile(loss=self._loss_metric,
-                           optimizer=self._optimizer)
-
-
-        # print(self.model.summary())
-
-        self.model.fit(channel.X_train,
-                       channel.y_train,
-                       batch_size=self._lstm_batch_size,
-                       epochs=self._epochs,
-                       shuffle=False,
-                       validation_split=self._validation_split,
-                       callbacks=cbs,
-                       verbose=True)
-
-
-
-    # def save(self):
-    #     """
-    #     Save trained model.
-    #     """
-
-    #     self.model.save(os.path.join('data', self.run_id, 'models',
-    #                                  '{}.h5'.format(self.chan_id)))
-
-    def aggregate_predictions(self, y_hat_batch, method='mean'): # pragma: no cover
-        """
-        Aggregates predictions for each timestep. When predicting n steps
-        ahead where n > 1, will end up with multiple predictions for a
-        timestep.
-
-        Args:
-            y_hat_batch (arr): predictions shape (<batch length>, <n_preds>)
-            method (string): indicates how to aggregate for a timestep - "first"
-                or "mean"
-        """
-
-        agg_y_hat_batch = np.array([])
-
-        for t in range(len(y_hat_batch)):
-
-            start_idx = t - self._n_predictions
-            start_idx = start_idx if start_idx >= 0 else 0
-
-            # predictions pertaining to a specific timestep lie along diagonal
-            y_hat_t = np.flipud(y_hat_batch[start_idx:t+1]).diagonal()
-
-            if method == 'first':
-                agg_y_hat_batch = np.append(agg_y_hat_batch, [y_hat_t[0]])
-            elif method == 'mean':
-                agg_y_hat_batch = np.append(agg_y_hat_batch, np.mean(y_hat_t))
-
-        agg_y_hat_batch = agg_y_hat_batch.reshape(len(agg_y_hat_batch), 1)
-        self.y_hat = np.append(self.y_hat, agg_y_hat_batch)
-
-
-
-    def batch_predict(self, channel):
-        """
-        Used trained LSTM model to predict test data arriving in batches.
-
-        Args:
-            channel (obj): Channel class object containing train/test data
-            for X,y for a single channel
-
-        Returns:
-            channel (obj): Channel class object with y_hat values as attribute
-        """
-
-        # num_batches = int((y_test.shape[0] - self._l_s)
-        #                   / self._batch_size)
-        # if num_batches < 0:
-        #     raise ValueError('l_s ({}) too large for stream length {}.'
- # .format(self._l_s, y_test.shape[0])) - - # # simulate data arriving in batches, predict each batch - # for i in range(0, num_batches + 1): - # prior_idx = i * self._batch_size - # idx = (i + 1) * self._batch_size - - # if i + 1 == num_batches + 1: - # # remaining values won't necessarily equal batch size - # idx = y_test.shape[0] - - # X_test_batch = X_test[prior_idx:idx] - # y_hat_batch = self.model.predict(X_test_batch) - # y_hat_batch = np.reshape(y_hat_batch,(X_test.shape[0],self._n_predictions,X_test.shape[2])) - # # print("PREDICTIONS",y_hat_batch.shape) - # self.aggregate_predictions(y_hat_batch) - - # self.y_hat = np.reshape(self.y_hat, (self.y_hat.size,)) - - # channel.y_hat = self.y_hat - - # # np.save(os.path.join('data', self.run_id, 'y_hat', '{}.npy' - # # .format(self.chan_id)), self.y_hat) - - # return channel - - self.y_hat = self.model.predict(channel.X_test) - self.y_hat = np.reshape(self.y_hat,(channel.X_test.shape[0],self._n_predictions,channel.X_test.shape[2])) - # print("shape before ",self.y_hat.shape) - channel.y_hat = self.y_hat - return channel diff --git a/tods/detection_algorithm/core/utils/utils.py b/tods/detection_algorithm/core/utils/utils.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tods/searcher/out.csv b/tods/searcher/out.csv deleted file mode 100644 index c94fe37d..00000000 --- a/tods/searcher/out.csv +++ /dev/null @@ -1,4 +0,0 @@ -,F_beta,RECALL,PRECISION,F1,F1_MACRO,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,time_total_s,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,trial_id,experiment_tag,config/detection_algorithm,config/feature_analysis,config/timeseries_processing,logdir -0,0.4955436153414089,0.6,0.0218978102189781,0.9028571428571428,0.4955436153414089,1.822270154953003,False,,,1,e37bc4d5110342fdbcfd7a80dbdcfe09,2022-12-21_16-10-22,1671639022,1.822270154953003,37322,wydkM5,172.17.0.4,1.822270154953003,0,1,f7b9d_00000,"0_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],[['statistical_maximum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-21_16-10-20/_evaluate_f7b9d_00000_0_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum']],timeseries_processing=[['ti_2022-12-21_16-10-20" -1,0.5053877406818583,0.8,0.0305343511450381,0.9085714285714288,0.5053877406818583,0.6240348815917969,False,,,1,a425b9c0b2e84add8f724cb055e9468f,2022-12-21_16-10-23,1671639023,0.6240348815917969,37322,wydkM5,172.17.0.4,0.6240348815917969,0,1,f7b9d_00001,"1_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],"[['statistical_maximum'], ['statistical_minimum']]",[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-21_16-10-20/_evaluate_f7b9d_00001_1_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_maximum'], ['statistical_minimum']],ti_2022-12-21_16-10-22" 
-2,0.5021753681392235,0.8,0.0287769784172661,0.9028571428571428,0.5021753681392235,0.5039956569671631,False,,,1,f02848f0489e40c794c1211e0b02c428,2022-12-21_16-10-23,1671639023,0.5039956569671631,37322,wydkM5,172.17.0.4,0.5039956569671631,0,1,f7b9d_00002,"2_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['time_series_seasonality_trend_decomposition']]",[['pyod_loda']],[['statistical_minimum']],[['time_series_seasonality_trend_decomposition']],"/root/ray_results/_evaluate_2022-12-21_16-10-20/_evaluate_f7b9d_00002_2_detection_algorithm=[['pyod_loda']],feature_analysis=[['statistical_minimum']],timeseries_processing=[['ti_2022-12-21_16-10-23" diff --git a/tods/searcher/search_space/detection_algorithm_test_search_space.json b/tods/searcher/search_space/detection_algorithm_test_search_space.json new file mode 100644 index 00000000..8d8b8697 --- /dev/null +++ b/tods/searcher/search_space/detection_algorithm_test_search_space.json @@ -0,0 +1,78 @@ +{ + "timeseries_processing":{ + "subsequence_segmentation":{ + + } + }, + "detection_algorithm": { + "pyod_ae": { + + }, + "pyod_loda": { + + }, + "telemanom":{ + + }, + "pyod_vae":{ + + }, + "pyod_sogaal":{ + + }, + "pyod_sod":{ + + }, + "pyod_ocsvm":{ + + }, + "pyod_mogaal":{ + + }, + "pyod_lof":{ + + }, + "pyod_knn":{ + + }, + "pyod_iforest":{ + + }, + "pyod_hbos":{ + + }, + "pyod_cof":{ + + }, + "pyod_cblof":{ + + }, + "pyod_abod":{ + + }, + "PCAODetector":{ + + }, + "matrix_profile":{ + + }, + "LSTMODetector":{ + + }, + "KDiscordODetector":{ + + }, + "Ensemble":{ + + }, + "deeplog":{ + + }, + "dagmm":{ + + }, + "AutoRegODetector":{ + + } + } +} \ No newline at end of file diff --git a/tods/searcher/search_space/example_search_space.json b/tods/searcher/search_space/example_search_space.json new file mode 100644 index 00000000..9279c86e --- /dev/null +++ b/tods/searcher/search_space/example_search_space.json @@ -0,0 +1,66 @@ +{ + "timeseries_processing": { + "time_series_seasonality_trend_decomposition": { + "use_semantic_types": [ + 1, + 0 + ] + }, + "moving_average_transform":{ + "window_size":[ + 3, + 4, + 5 + ], + "norm":[ + "l1", + "l2", + "max" + ], + "use_semantic_types":[ + 0, + 1 + ] + } + }, + "feature_analysis": { + "statistical_h_mean": { + "window_size": [ + 10, + 20 + ] + }, + "statistical_maximum": { + "window_size": [ + 10, + 20 + ] + }, + "statistical_minimum": { + "window_size": [ + 10, + 20 + ] + } + }, + "detection_algorithm": { + "pyod_ae": { + "dropout_rate": [ + 0.1, + 0.2 + ] + }, + "pyod_loda": { + "n_bins": [ + 10, + 20 + ] + }, + "pyod_cof": { + "n_neighborss": [ + 15, + 20 + ] + } + } +} \ No newline at end of file diff --git a/tods/searcher/search_space/feature_analysis_test_search_space.json b/tods/searcher/search_space/feature_analysis_test_search_space.json new file mode 100644 index 00000000..51d67c31 --- /dev/null +++ b/tods/searcher/search_space/feature_analysis_test_search_space.json @@ -0,0 +1,17 @@ +{ + "timeseries_processing": { + "time_series_seasonality_trend_decomposition": { + "use_semantic_types": [ + 1, + 0 + ] + } + }, + "feature_analysis": { + "bk_filter": { }, + "non_negative_matrix_factorization": { }, + "statistical_variation": { }, + "wavelet_transform": { } + } + + } \ No newline at end of file diff --git a/tods/searcher/search_space/test.json b/tods/searcher/search_space/test.json new file mode 100644 index 00000000..76132427 --- /dev/null +++ b/tods/searcher/search_space/test.json @@ -0,0 +1,33 @@ +{ + "timeseries_processing": { + 
"time_series_seasonality_trend_decomposition": { + "use_semantic_types": [ + 1, + 0 + ] + } + }, + "feature_analysis": { + + "statistical_maximum": { + "window_size": [ + 10, + 20 + ] + }, + "statistical_minimum": { + "window_size": [ + 10, + 20 + ] + } + }, + "detection_algorithm": { + "pyod_loda": { + "n_bins": [ + 10, + 20 + ] + } + } + } \ No newline at end of file diff --git a/tods/searcher/search_space/timeseries_processing_test_search_space.json b/tods/searcher/search_space/timeseries_processing_test_search_space.json new file mode 100644 index 00000000..dac82716 --- /dev/null +++ b/tods/searcher/search_space/timeseries_processing_test_search_space.json @@ -0,0 +1,88 @@ +{ + "timeseries_processing": { + "moving_average_transform":{ + "window_size":[ + 3, + 4, + 5 + ], + "norm":[ + "l1", + "l2", + "max" + ], + "use_semantic_types":[ + 0, + 1 + ] + }, + "axiswise_scaler":{ + "with_mean":[ + 0, + 1 + ], + "with_std":[ + 0, + 1 + ] + }, + "power_transformer":{ + "method":[ + "yeo-johnson", + "box-cox" + ] + }, + "quantile_transformer":{ + "n_quantiles": 1000 + }, + "standard_scaler":{ + "with_mean":[ + 0, + 1 + ], + "with_std":[ + 0, + 1 + ] + }, + "simple_exponential_smoothing":{ + "endog":[ + 2, + 3 + ] + }, + "subsequence_segmentation":{ + "window_size":[ + 3, + 4, + 5 + ], + "step":[ + 1, + 2 + ] + }, + "time_series_seasonality_trend_decomposition": { + "use_semantic_types": [ + 1, + 0 + ] + } + }, + "feature_analysis": { + "statistical_minimum": { + "window_size": [ + 10, + 20 + ] + } + }, + "detection_algorithm": { + "pyod_loda": { + "n_bins": [ + 10, + 20 + ] + } + } + } \ No newline at end of file diff --git a/tods/searcher/searcher.py b/tods/searcher/searcher.py index 6ace8b23..35a08847 100644 --- a/tods/searcher/searcher.py +++ b/tods/searcher/searcher.py @@ -146,43 +146,7 @@ def search(self,search_space, config): return [self.get_search_result(primitive_analysis,config),self.get_search_result(hyperparam_analysis,config)] - def set_search_algorithm(self, algorithm): - """ - Determine which searcher to choose based on user needs - - Parameters - ---------- - algorithm: str - Name of the desired search algorithm to use - - Returns - ------- - searcher - """ - if algorithm == "random": - from ray.tune.suggest.basic_variant import BasicVariantGenerator - searcher = BasicVariantGenerator() # Random/Grid Searcher - elif algorithm == "hyperopt": - from ray.tune.suggest.hyperopt import HyperOptSearch - searcher = HyperOptSearch(max_concurrent=2, metric="RECALL") # HyperOpt Searcher - elif algorithm == "zoopt": - zoopt_search_config = { - "parallel_num": 64, # how many workers to parallel - } - from ray.tune.suggest.zoopt import ZOOptSearch - searcher = ZOOptSearch(budget=20, **zoopt_search_config) - elif algorithm == "skopt": - from ray.tune.suggest.skopt import SkOptSearch - searcher = SkOptSearch() - elif algorithm == "nevergrad": - import nevergrad as ng - from ray.tune.suggest.nevergrad import NevergradSearch - searcher = NevergradSearch( - optimizer=ng.optimizers.OnePlusOne) - else: - raise ValueError("Searching algorithm not supported.") - return searcher def get_search_result(self, analysis,config): @@ -282,235 +246,6 @@ def _evaluate(self, search_space): yield eval_metric - def build_pipeline(self, search_space): - """ - Build an outlier detection system - Args: - search_space: - A search space transform from a json file user designed - defines valid values for your hyperparameters and can specify how these values are sampled. 
- - """ - from d3m import index - from d3m.metadata.base import ArgumentType - from d3m.metadata.pipeline import Pipeline, PrimitiveStep - import sys - - primitive_map = {'axiswise_scaler': 'transformation', - 'standard_scaler': 'transformation', - 'power_transformer': 'transformation', - 'quantile_transformer': 'transformation', - 'moving_average_transform': 'transformation', - 'simple_exponential_smoothing': 'transformation', - 'holt_smoothing': 'transformation', - 'holt_winters_exponential_smoothing': 'transformation', - 'time_series_seasonality_trend_decomposition': 'decomposition', - 'subsequence_segmentation': '' - } - - pipeline_description = Pipeline() - pipeline_description.add_input(name='inputs') - - counter = 0 - - # Step 0: dataset_to_dataframe - step_0 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.dataset_to_dataframe')) - step_0.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='inputs.0') - step_0.add_output('produce') - pipeline_description.add_step(step_0) - counter += 1 - - # Step 1: column_parser - step_1 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.column_parser')) - step_1.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.0.produce') - step_1.add_output('produce') - pipeline_description.add_step(step_1) - counter += 1 - - # Step 2: extract_columns_by_semantic_types(attributes) - step_2 = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.extract_columns_by_semantic_types')) - step_2.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') - step_2.add_output('produce') - step_2.add_hyperparameter(name='semantic_types', argument_type=ArgumentType.VALUE, - data=['https://metadata.datadrivendiscovery.org/types/Attribute']) - pipeline_description.add_step(step_2) - counter += 1 - - - - if 'timeseries_processing' in search_space.keys(): - timeseries_processing_list = [] - - timeseries_processing = search_space.pop('timeseries_processing', None) - if ' ' in timeseries_processing: - timeseries_processing_list = timeseries_processing.split(' ') - else: - timeseries_processing_list.append(timeseries_processing) - - for x in range(len(timeseries_processing_list)): - this = sys.modules[__name__] - name = 'step_' + str(counter) - # setattr(this, name, PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.' + primitive_map[timeseries_processing_list[x]] + '.' + timeseries_processing_list[x]))) - # this.name = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.' + primitive_map[timeseries_processing_list[x]] + '.' + timeseries_processing_list[x])) - - - - setattr(this, name, PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.' + timeseries_processing_list[x]))) - this.name = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.timeseries_processing.' + timeseries_processing_list[x])) - - this.name.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.' 
+ str(counter - 1) + '.produce') - for key, value in search_space.items(): - if timeseries_processing_list[x] in key: - hp_name = key.replace(timeseries_processing_list[x] + '_', '') - if value == "None": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=None) - elif value == "True": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=True) - elif value == "False": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=False) - else: - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=value) - this.name.add_output('produce') - pipeline_description.add_step(this.name) - counter += 1 - - - - - - - feature_analysis_list = [] - - feature_analysis = search_space.pop('feature_analysis', None) - if ' ' in feature_analysis: - feature_analysis_list = feature_analysis.split(' ') - else: - feature_analysis_list.append(feature_analysis) - - - for x in range(len(feature_analysis_list)): - this = sys.modules[__name__] - name = 'step_' + str(counter) - setattr(this, name, PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.' + feature_analysis_list[x]))) - this.name = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.feature_analysis.' + feature_analysis_list[x])) - - this.name.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.' + str(counter - 1) + '.produce') - for key, value in search_space.items(): - if feature_analysis_list[x] in key: - hp_name = key.replace(feature_analysis_list[x] + '_', '') - if value == "None": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=None) - elif value == "True": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=True) - elif value == "False": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=False) - else: - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=value) - this.name.add_output('produce') - pipeline_description.add_step(this.name) - counter += 1 - - - - - - detection_algorithm_list = [] - - detection_algorithm = search_space.pop('detection_algorithm', None) - if ' ' in detection_algorithm: - detection_algorithm_list = detection_algorithm.split(' ') - else: - detection_algorithm_list.append(detection_algorithm) - - for x in range(len(detection_algorithm_list)): - this = sys.modules[__name__] - name = 'step_' + str(counter) - setattr(this, name, PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.' + detection_algorithm_list[x]))) - this.name = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.detection_algorithm.' + detection_algorithm_list[x])) - # print(this.name.metadata['hyperparams_to_tune']) - this.name.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.' 
+ str(counter - 1) + '.produce') - for key, value in search_space.items(): - if detection_algorithm_list[x] in key: - hp_name = key.replace(detection_algorithm_list[x] + '_', '') - if value == "None": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=None) - elif value == "True": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=True) - elif value == "False": - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=False) - else: - this.name.add_hyperparameter(name=hp_name, argument_type=ArgumentType.VALUE, data=value) - this.name.add_output('produce') - pipeline_description.add_step(this.name) - counter += 1 - - - - - - - - for i in range(1): - this = sys.modules[__name__] - name = 'step_' + str(counter) - setattr(this, name, PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.construct_predictions'))) - this.name = PrimitiveStep(primitive=index.get_primitive('d3m.primitives.tods.data_processing.construct_predictions')) - - this.name.add_argument(name='inputs', argument_type=ArgumentType.CONTAINER, data_reference='steps.' + str(counter - 1) + '.produce') - this.name.add_argument(name='reference', argument_type=ArgumentType.CONTAINER, data_reference='steps.1.produce') - this.name.add_output('produce') - pipeline_description.add_step(this.name) - counter += 1 - - - - - pipeline_description.add_output(name='output predictions', data_reference='steps.' + str(counter - 1) + '.produce') - data = pipeline_description.to_json() - - # input() - return pipeline_description - - - - def clearer_best_config(self, best_config): - """ - Output the best config in a clearer format - - Parameters - ---------- - best_config: - A config which achieves the best performance - obtained by searcher analysis - - Returns - ------- - None - """ - - print('the best choice for timeseries_processing is: ', best_config['timeseries_processing']) - for key, value in best_config.items(): - temp = best_config['timeseries_processing'].split(" ") - for i in temp: - if (i + '_') in key: - print("the best" + key.replace(i + '_', " ") + " for " + - i + ": " + str(value)) - - print('the best choice for feature analysis is: ', best_config['feature_analysis']) - for key, value in best_config.items(): - temp = best_config['feature_analysis'].split(" ") - for i in temp: - if (i + '_') in key: - print("the best" + key.replace(i + '_', " ") + " for " + - i + ": " + str(value)) - - print('the best choice for detection algorithm is: ', best_config['detection_algorithm']) - for key, value in best_config.items(): - temp = best_config['detection_algorithm'].split(" ") - for i in temp: - if (i + '_') in key: - print("the best" + key.replace(i + '_', " ") + " for " + - i + ": " + str(value)) def find_best_pipeline(self, best_config, results_dataframe): """ diff --git a/tods/tests/searcher/test_pipeline.py b/tods/tests/searcher/test_pipeline.py index d8971483..8ec3784f 100644 --- a/tods/tests/searcher/test_pipeline.py +++ b/tods/tests/searcher/test_pipeline.py @@ -30,6 +30,14 @@ ['statistical_minimum',]], #Specify hyperparams as k,v pairs } +config_system = {'detection_algorithm': [ + ('pyod_ocsvm',) +], + + 'feature_analysis': [ + ('statistical_maximum',), + ] +} # table_path = '../../../datasets/anomaly/raw_data/yahoo_sub_5.csv' # df = pd.read_csv(table_path) # dataset = generate_dataset(df, 6) @@ -217,6 +225,277 @@ def test_build_pipeline(self): 
 self.assertEqual(self.built_pipeline.to_json_structure()['inputs'],pipeline_description['inputs'])
         self.assertEqual(self.built_pipeline.to_json_structure()['outputs'],pipeline_description['outputs'])
+    def test_build_system_pipeline(self):
+        pipeline_description = {
+            "id": "73e15443-4ee7-40d5-8b76-a01b06333d50",
+            "schema": "https://metadata.datadrivendiscovery.org/schemas/v0/pipeline.json",
+            "created": "2023-01-30T16:39:07.005212Z",
+            "inputs": [
+                {
+                    "name": "inputs"
+                }
+            ],
+            "outputs": [
+                {
+                    "data": "steps.9.produce",
+                    "name": "output predictions"
+                }
+            ],
+            "steps": [
+                {
+                    "type": "PRIMITIVE",
+                    "primitive": {
+                        "id": "f31f8c1f-d1c5-43e5-a4b2-2ae4a761ef2e",
+                        "version": "0.2.0",
+                        "python_path": "d3m.primitives.tods.common.denormalize",
+                        "name": "Denormalize datasets"
+                    },
+                    "arguments": {
+                        "inputs": {
+                            "type": "CONTAINER",
+                            "data": "inputs.0"
+                        }
+                    },
+                    "outputs": [
+                        {
+                            "id": "produce"
+                        }
+                    ]
+                },
+                {
+                    "type": "PRIMITIVE",
+                    "primitive": {
+                        "id": "c78138d9-9377-31dc-aee8-83d9df049c60",
+                        "version": "0.3.0",
+                        "python_path": "d3m.primitives.tods.data_processing.dataset_to_dataframe",
+                        "name": "Extract a DataFrame from a Dataset"
+                    },
+                    "arguments": {
+                        "inputs": {
+                            "type": "CONTAINER",
+                            "data": "steps.0.produce"
+                        }
+                    },
+                    "outputs": [
+                        {
+                            "id": "produce"
+                        }
+                    ]
+                },
+                {
+                    "type": "PRIMITIVE",
+                    "primitive": {
+                        "id": "989562ac-b50f-4462-99cb-abef80d765b2",
+                        "version": "0.1.0",
+                        "python_path": "d3m.primitives.tods.common.csv_reader",
+                        "name": "Columns CSV reader"
+                    },
+                    "arguments": {
+                        "inputs": {
+                            "type": "CONTAINER",
+                            "data": "steps.1.produce"
+                        }
+                    },
+                    "outputs": [
+                        {
+                            "id": "produce"
+                        }
+                    ],
+                    "hyperparams": {
+                        "use_columns": {
+                            "type": "VALUE",
+                            "data": [
+                                0,
+                                1
+                            ]
+                        },
+                        "return_result": {
+                            "type": "VALUE",
+                            "data": "replace"
+                        }
+                    }
+                },
+                {
+                    "type": "PRIMITIVE",
+                    "primitive": {
+                        "id": "81235c29-aeb9-3828-911a-1b25319b6998",
+                        "version": "0.3.0",
+                        "python_path": "d3m.primitives.tods.data_processing.column_parser",
+                        "name": "Parses strings into their types"
+                    },
+                    "arguments": {
+                        "inputs": {
+                            "type": "CONTAINER",
+                            "data": "steps.2.produce"
+                        }
+                    },
+                    "outputs": [
+                        {
+                            "id": "produce"
+                        }
+                    ],
+                    "hyperparams": {
+                        "parse_semantic_types": {
+                            "type": "VALUE",
+                            "data": [
+                                "http://schema.org/Boolean",
+                                "http://schema.org/Integer",
+                                "http://schema.org/Float",
+                                "https://metadata.datadrivendiscovery.org/types/FloatVector"
+                            ]
+                        }
+                    }
+                },
+                {
+                    "type": "PRIMITIVE",
+                    "primitive": {
+                        "id": "a996cd89-ddf0-367f-8e7f-8c013cbc2891",
+                        "version": "0.3.0",
+                        "python_path": "d3m.primitives.tods.data_processing.extract_columns_by_semantic_types",
+                        "name": "Extracts columns by semantic type"
+                    },
+                    "arguments": {
+                        "inputs": {
+                            "type": "CONTAINER",
+                            "data": "steps.3.produce"
+                        }
+                    },
+                    "outputs": [
+                        {
+                            "id": "produce"
+                        }
+                    ],
+                    "hyperparams": {
+                        "semantic_types": {
+                            "type": "VALUE",
+                            "data": [
+                                "https://metadata.datadrivendiscovery.org/types/Attribute"
+                            ]
+                        }
+                    }
+                },
+                {
+                    "type": "PRIMITIVE",
+                    "primitive": {
+                        "id": "a996cd89-ddf0-367f-8e7f-8c013cbc2891",
+                        "version": "0.3.0",
+                        "python_path": "d3m.primitives.tods.data_processing.extract_columns_by_semantic_types",
+                        "name": "Extracts columns by semantic type"
+                    },
+                    "arguments": {
+                        "inputs": {
+                            "type": "CONTAINER",
+                            "data": "steps.3.produce"
+                        }
+                    },
+                    "outputs": [
+                        {
+                            "id": "produce"
+                        }
+                    ],
+                    "hyperparams": {
+                        "semantic_types": {
+                            "type": "VALUE",
+                            "data": [
+                                "https://metadata.datadrivendiscovery.org/types/TrueTarget"
+                            ]
+                        }
+                    }
+                },
+                {
+                    "type": "PRIMITIVE",
"primitive": { + "id": "f07ce875-bbc7-36c5-9cc1-ba4bfb7cf48e", + "version": "0.3.0", + "python_path": "d3m.primitives.tods.feature_analysis.statistical_maximum", + "name": "Time Series Decompostional" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.4.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "b454adf7-5820-3e6f-8383-619f13fb1cb6", + "version": "0.3.0", + "python_path": "d3m.primitives.tods.detection_algorithm.pyod_ocsvm", + "name": "TODS.anomaly_detection_primitives.OCSVMPrimitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.6.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "01d36760-235c-3cdd-95dd-3c682c634c49", + "version": "0.3.0", + "python_path": "d3m.primitives.tods.detection_algorithm.system_wise_detection", + "name": "Sytem_Wise_Anomaly_Detection_Primitive" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.7.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + }, + { + "type": "PRIMITIVE", + "primitive": { + "id": "2530840a-07d4-3874-b7d8-9eb5e4ae2bf3", + "version": "0.3.0", + "python_path": "d3m.primitives.tods.data_processing.construct_predictions", + "name": "Construct pipeline predictions output" + }, + "arguments": { + "inputs": { + "type": "CONTAINER", + "data": "steps.8.produce" + }, + "reference": { + "type": "CONTAINER", + "data": "steps.1.produce" + } + }, + "outputs": [ + { + "id": "produce" + } + ] + } + ], + "digest": "193c74a8386c80f5ce81ab8d979eef97f46901cf63c70d45c3b4a2064b3df4c9" +} + self.built_system_pipeline = build_pipeline(config_system) + + self.assertIsInstance(self.built_pipeline,Pipeline) + self.assertEqual(self.built_pipeline.to_json_structure()['steps'],pipeline_description['steps']) + self.assertEqual(self.built_pipeline.to_json_structure()['schema'],pipeline_description['schema']) + self.assertEqual(self.built_pipeline.to_json_structure()['inputs'],pipeline_description['inputs']) + self.assertEqual(self.built_pipeline.to_json_structure()['outputs'],pipeline_description['outputs']) def test_generate_problem(self): self.generated_dataset = generate_dataset(dataframe,6) self.assertIsInstance(self.generated_dataset,Dataset) diff --git a/tods/tests/searcher/test_searcher.py b/tods/tests/searcher/test_searcher.py index 855a1624..34d5e6f2 100644 --- a/tods/tests/searcher/test_searcher.py +++ b/tods/tests/searcher/test_searcher.py @@ -95,7 +95,7 @@ def test_searcher_simple_searchspace(self): else: self.assertEqual(hyperparameter_search_result['best_config'], best_pipeline) - def test_searcher_simple_searchspace(self): + def test_searcher_exhaustive_searchspace(self): config = { "metric":'F1_MACRO', "num_samples": 1, @@ -121,8 +121,7 @@ def test_searcher_simple_searchspace(self): if i ==0: best_pipeline = search_result['best_config'] else: - self.assertEqual(search_result['best_config'], best_pipeline) - + self.assertEqual(search_result['best_config'], best_pipeline) if __name__ == '__main__':