From e1dedbc6df75aac863cd16d6cec4097a62930ff0 Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Tue, 28 May 2019 20:17:32 -0700
Subject: [PATCH 1/4] adding baseline functionality for benchmark script

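A rough usage sketch of the new flag (the baseline file name is
illustrative). A run without --pretty and without --baseline now dumps
JSON; feeding that JSON back in through --baseline prints the pretty
table with an extra "change" column, and reuses the filenames recorded
in the baseline when none are passed on the command line:

    python scripts/benchmark.py > baseline.json
    python scripts/benchmark.py --baseline baseline.json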
---
 scripts/benchmark.py | 141 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 112 insertions(+), 29 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 34b7bd6e..dbfc7a35 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -10,6 +10,8 @@
 
 from monotonic import monotonic
 
+from detect_secrets.core.color import AnsiColor
+from detect_secrets.core.color import colorize
 from detect_secrets.core.usage import PluginOptions
 
 
@@ -92,7 +94,7 @@ def get_arguments():
     parser.add_argument(
         '--harakiri',
         default=5,
-        type=float,
+        type=assert_positive(float),
         help=(
             'Specifies an upper bound for the number of seconds to wait '
             'per execution.'
         ),
     )
@@ -102,23 +104,35 @@ def get_arguments():
         '-n',
         '--num-iterations',
         default=1,
-        type=assert_positive_integer,
+        type=assert_positive(int),
         help=(
             'Specifies the number of times to run the test. '
             'Results will be averaged over this value.'
         ),
     )
+    parser.add_argument(
+        '--baseline',
+        type=assert_valid_file,
+        help=(
+            'If provided, will compare performance with provided baseline. '
+            'Assumes pretty output (otherwise, you can do the comparison '
+            'yourself).'
+        ),
+    )
 
     args = parser.parse_args()
     if not args.filenames:
-        args.filenames = [
-            os.path.realpath(
-                os.path.join(
-                    os.path.dirname(__file__),
-                    '../',
+        if args.baseline:
+            args.filenames = args.baseline['filenames']
+        else:
+            args.filenames = [
+                os.path.realpath(
+                    os.path.join(
+                        os.path.dirname(__file__),
+                        '../',
+                    ),
                 ),
-            ),
-        ]
+            ]
 
     if not args.plugin:
         args.plugin = plugins
@@ -126,16 +140,30 @@ def get_arguments():
     return args
 
 
-def assert_positive_integer(string):
-    value = int(string)
-    if value <= 0:
+def assert_positive(type):
+    def wrapped(string):
+        value = type(string)
+        if value <= 0:
+            raise argparse.ArgumentTypeError(
+                '{} must be a positive {}.'.format(
+                    string,
+                    type.__name__,
+                ),
+            )
+
+        return value
+
+    return wrapped
+
+
+def assert_valid_file(string):
+    if not os.path.isfile(string):
         raise argparse.ArgumentTypeError(
-            '{} must be a positive integer.'.format(
-                string,
-            ),
+            '{} must be a valid file.'.format(string),
         )
 
-    return value
+    with open(string) as f:
+        return json.load(f)
 
 
 def time_execution(filenames, timeout, num_iterations=1, flags=None):
@@ -166,7 +194,7 @@ def time_execution(filenames, timeout, num_iterations=1, flags=None):
         if result == timeout:
             return None
 
-    return statistics.mean(scores)
+    return round(statistics.mean(scores), 5)
 
 
 def print_output(timings, args):
@@ -174,31 +202,86 @@ def print_output(timings, args):
     :type timings: dict
     :type args: Namespace
     """
-    if not args.pretty:
-        print(json.dumps(timings))
+    if not args.pretty and not args.baseline:
+        print(
+            json.dumps({
+                'filenames': args.filenames,
+                'timings': timings,
+            }),
+        )
         return
 
     # Print header
-    print('-' * 42)
-    print('{:<20s}{:>20s}'.format('plugin', 'time'))
-    print('-' * 42)
+    baseline = args.baseline['timings'] if args.baseline else {}
+    if not baseline:
+        print('-' * 40)
+        print('{:<25s}{:>15s}'.format('plugin', 'time'))
+        print('-' * 40)
+    else:
+        print('-' * 57)
+        print('{:<25s}{:>13s}{:>16s}'.format('plugin', 'time', 'change'))
+        print('-' * 57)
 
+    # Print content
     if 'all-plugins' in timings:
-        print_line('all-plugins', timings['all-plugins'])
+        print_line(
+            'All Plugins',
+            timings['all-plugins'],
+            baseline.get('all-plugins'),
+        )
         del timings['all-plugins']
 
     for key in sorted(timings):
-        print_line(key, timings[key])
-    print('-' * 42)
+        print_line(key, timings[key], baseline.get(key))
+
+    # Print footer line
+    if not args.baseline:
+        print('-' * 40)
+    else:
+        print('-' * 57)
+
+
+def print_line(name, time, baseline):
+    """
+    :type name: str
+
+    :type time: float
+    :param time: seconds it took to execute
 
-def print_line(name, time):
+    :type baseline: float
+    :param baseline: expected seconds to execute
+    """
     if not time:
-        time = 'Timeout exceeded!'
+        time_string = 'Timeout exceeded!'
     else:
-        time = '{}s'.format(str(time))
+        time_string = '{}s'.format(str(time))
+
+    if baseline:
+        difference = round(baseline - time, 2)
+        if difference > 0:
+            difference_string = colorize(
+                '▲ {}'.format(difference),
+                AnsiColor.LIGHT_GREEN,
+            )
+            difference_string = '{:>24s}'.format(difference_string)
+        elif difference < 0:
+            difference_string = colorize(
+                '▼ {}'.format(difference),
+                AnsiColor.RED,
+            )
+            difference_string = '{:>24s}'.format(difference_string)
+        else:
+            difference_string = '{:>12s}'.format('-')
 
-    print('{:<20s}{:>20s}'.format(name, time))
+        print(
+            '{:<25s}{:>15s}{}'.format(
+                name,
+                time_string,
+                difference_string,
+            ),
+        )
+    else:
+        print('{:<25s}{:>15s}'.format(name, time_string))
 
 
 if __name__ == '__main__':

From 323df84db3bc05862596ea325f15953076f7af7b Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Tue, 28 May 2019 23:01:08 -0700
Subject: [PATCH 2/4] adding performance tests

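A rough sketch of the intended workflow (the output file name is
illustrative). The script writes its JSON results to stdout and its
progress messages to stderr, so the output can be redirected straight
into a baseline file and fed back in on a later run:

    python scripts/run_performance_tests.py -m LONG_FILE > baseline.json
    python scripts/run_performance_tests.py -m LONG_FILE --baseline baseline.json

When --baseline is supplied, the mode and length are read back from the
baseline's "config" block (argparse still requires -m, but its value is
overridden).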
---
 detect_secrets/util.py               |  10 ++
 scripts/benchmark.py                 |  10 +-
 scripts/run_performance_tests.py     | 241 +++++++++++++++++++++++++++
 test_data/performance/best-songs.txt |  52 ++++++
 4 files changed, 305 insertions(+), 8 deletions(-)
 create mode 100644 detect_secrets/util.py
 create mode 100644 scripts/run_performance_tests.py
 create mode 100644 test_data/performance/best-songs.txt

diff --git a/detect_secrets/util.py b/detect_secrets/util.py
new file mode 100644
index 00000000..7a135a42
--- /dev/null
+++ b/detect_secrets/util.py
@@ -0,0 +1,10 @@
+import os
+
+
+def get_root_directory():
+    return os.path.realpath(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../',
+        ),
+    )
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index dbfc7a35..8dbe7038 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -13,6 +13,7 @@
 from detect_secrets.core.color import AnsiColor
 from detect_secrets.core.color import colorize
 from detect_secrets.core.usage import PluginOptions
+from detect_secrets.util import get_root_directory
 
 
 def main():
@@ -125,14 +126,7 @@ def get_arguments():
         if args.baseline:
             args.filenames = args.baseline['filenames']
         else:
-            args.filenames = [
-                os.path.realpath(
-                    os.path.join(
-                        os.path.dirname(__file__),
-                        '../',
-                    ),
-                ),
-            ]
+            args.filenames = [get_root_directory()]
 
     if not args.plugin:
         args.plugin = plugins
diff --git a/scripts/run_performance_tests.py b/scripts/run_performance_tests.py
new file mode 100644
index 00000000..1c0f86d6
--- /dev/null
+++ b/scripts/run_performance_tests.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import random
+import subprocess
+import sys
+import tempfile
+from enum import Enum
+
+from detect_secrets.util import get_root_directory
+
+
+class TestCase(Enum):
+    LONG_FILE = 1
+    LONG_LINES = 2
+
+
+def main():
+    args = parse_args()
+
+    # Get data from baseline
+    if args.baseline:
+        config = args.baseline['config']
+        args.mode = config['mode']
+        args.length = config['length']
+
+    mode = None
+    for case in TestCase:
+        if case.name == args.mode:
+            mode = case
+            break
+
+    content = generate_test_content(
+        mode,
+        timeout=args.harakiri,
+        length=args.length,
+    )
+    output = scan_content(
+        content,
+        timeout=args.harakiri,
+        baseline=args.baseline,
+    )
+
+    if not args.baseline:
+        temp = json.loads(output)
+        temp['config'] = {
+            'mode': mode.name,
+            'length': args.length,
+        }
+
+        output = json.dumps(temp)
+
+    print(output)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--harakiri',
+        default=30,
+        type=assert_positive(float),
+        help=(
+            'Specifies an upper bound for number of seconds to wait for '
+            'each test.'
+        ),
+    )
+    parser.add_argument(
+        '--baseline',
+        type=assert_valid_file,
+        help=(
+            'Specifies test config file to run. If this is provided, '
+            'all config options will be referenced from this file.'
+        ),
+    )
+    parser.add_argument(
+        '-m',
+        '--mode',
+        choices=[
+            value.name
+            for value in TestCase
+        ],
+        required=True,
+        help='Specifies the type of content to generate.',
+    )
+    parser.add_argument(
+        '-L',
+        '--length',
+        type=assert_positive(int),
+        help='Length of test case content.',
+    )
+
+    return parser.parse_args()
+
+
+def assert_positive(type):
+    def wrapped(string):
+        value = type(string)
+        if value <= 0:
+            raise argparse.ArgumentTypeError(
+                '{} must be a positive {}.'.format(
+                    string,
+                    type.__name__,
+                ),
+            )
+
+        return value
+
+    return wrapped
+
+
+def assert_valid_file(string):
+    if not os.path.isfile(string):
+        raise argparse.ArgumentTypeError(
+            '{} must be a valid file.'.format(string),
+        )
+
+    with open(string) as f:
+        return json.load(f)
+
+
+def generate_test_content(mode, **kwargs):
+    """
+    :type mode: TestCase
+    :type length: int
+    :type timeout: float
+    """
+    if not kwargs['length']:
+        del kwargs['length']
+
+    print('Generating content...', file=sys.stderr)
+    if mode == TestCase.LONG_FILE:
+        return generate_long_file(**kwargs)
+    elif mode == TestCase.LONG_LINES:
+        return generate_long_lines(**kwargs)
+
+
+def scan_content(content, timeout, baseline=None):
+    """
+    :type content: str
+    :type timeout: float
+    :type baseline: dict|None
+    """
+    args = [
+        'python',
+        os.path.join(
+            get_root_directory(),
+            'scripts/benchmark.py',
+        ),
+        '--harakiri', str(timeout),
+    ]
+
+    with tempfile.NamedTemporaryFile('w') as f:
+        f.write(content)
+
+        print('Running checks...', file=sys.stderr)
+        if not baseline:
+            args.append(f.name)
+            return subprocess.check_output(
+                args,
+                stderr=subprocess.DEVNULL,
+            ).decode('utf-8')
+
+        with tempfile.NamedTemporaryFile('w') as b:
+            b.write(
+                json.dumps({
+                    'filenames': [f.name],
+                    'timings': baseline['timings'],
+                }),
+            )
+            b.seek(0)
+
+            args.append('--baseline')
+            args.append(b.name)
+
+            return subprocess.check_output(
+                args,
+                stderr=subprocess.DEVNULL,
+            ).decode('utf-8')
+
+
+def generate_long_file(length=250000, **kwargs):
+    return generate_content(
+        separator='\n',
+        length=length,
+    )
+
+
+def generate_long_lines(length=250000, **kwargs):
+    return generate_content(
+        separator=' ',
+        length=length,
+    )
+
+
+def generate_content(separator, length):
+    """
+    :type secret: str
+    :type separator: str
+    :type length: int
+    """
+    valid_secrets = {
+        'AWSKeyDetector': 'AKIATESTTESTTESTTEST',
+        'ArtifactoryDetector': ':AKCtestTESTte',
+        'Base64HighEntropyString': 'Y29uZ3JhdHVsYXRpb25zISB0aGlzIGlzIGEgaGlkZGVuIG1lc3NhZ2U=',
+        'BasicAuthDetector': 'http://username:password@example.com',
+        'HexHighEntropyString': '123456abcd',
+        'KeywordDetector': 'api_key = foobar',
+        'PrivateKeyDetector': 'BEGIN PRIVATE KEY',
+        'SlackDetector': 'xoxb-1-test',
+        'StripeDetector': 'rk_live_TESTtestTESTtestTESTtest',
+    }
+
+    with open(
+        os.path.join(
+            get_root_directory(),
+            'test_data/performance/best-songs.txt',
+        ),
+    ) as f:
+        source_material = f.read().splitlines()
+
+    indexes = {}
+    for key in valid_secrets:
+        index = random.randint(0, length - 1)
+        indexes[index] = key
+
+    content = []
+    for line_number in range(length):
+        if line_number in indexes:
+            content.append(valid_secrets[indexes[line_number]])
+        else:
+            random_line = random.randint(0, len(source_material) - 1)
+            content.append(source_material[random_line])
+
+    return separator.join(content)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/test_data/performance/best-songs.txt b/test_data/performance/best-songs.txt
new file mode 100644
index 00000000..bd77eb34
--- /dev/null
+++ b/test_data/performance/best-songs.txt
@@ -0,0 +1,52 @@
+Is this the real life?
+Is this just fantasy?
+Caught in a landslide
+No escape from reality
+Open your eyes
+Look up to the skies and see
+I'm just a poor boy, I need no sympathy
+Because I'm easy come, easy go
+A little high, little low
+Anyway the wind blows, doesn't really matter to me, to me
+Mama, just killed a man
+Put a gun against his head
+Pulled my trigger, now he's dead
+Mama, life had just begun
+But now I've gone and thrown it all away
+Mama, oh oh
+Didn't mean to make you cry
+If I'm not back again this time tomorrow
+Carry on, carry on, as if nothing really matters
+Too late, my time has come
+Sends shivers down my spine
+Body's aching all the time
+Goodbye everybody I've got to go
+Gotta leave you all behind and face the truth
+Mama, oh oh (anyway the wind blows)
+I don't want to die
+Sometimes wish I'd never been born at all
+I see a little silhouetto of a man
+Scaramouch, Scaramouch will you do the Fandango
+Thunderbolt and lightning very very frightening me
+Gallileo, Gallileo, Gallileo, Gallileo, Gallileo, figaro, magnifico
+I'm just a poor boy and nobody loves me
+He's just a poor boy from a poor family
+Spare him his life from this monstrosity
+Easy come easy go will you let me go
+Bismillah, no we will not let you go, let him go
+Bismillah, we will not let you go, let him go
+Bismillah, we will not let you go, let me go
+(Will not let you go) let me go (never, never let you go) let me go (never let me go)
+Oh oh no, no, no, no, no, no, no
+Oh mama mia, mama mia, mama mia let me go
+Beelzebub has a devil put aside for me for me for me
+So you think you can stop me and spit in my eye
+So you think you can love me and leave me to die
+Oh baby can't do this to me baby
+Just gotta get out just gotta get right outta here
+Oh oh oh yeah, oh oh yeah
+Nothing really matters
+Anyone can see
+Nothing really matters
+Nothing really matters to me
+Anyway the wind blows

From 1e29d5ffa028d096367c74774c4cc73c34bbfd51 Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Wed, 29 May 2019 21:01:00 -0700
Subject: [PATCH 3/4] bug fix: handling baseline output when timeout was exceeded for benchmark script

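For context: a baseline records a plugin's timing as null whenever that
run exceeded --harakiri, as in this excerpt from the long-lines baseline
added later in this series:

    "timings": {
        "Base64HighEntropyString": 22.52337,
        "all-plugins": null
    }

Previously print_line assumed both numbers were present. It now falls
back to the --harakiri value for whichever side timed out, and prints
'-' when both runs timed out.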
---
 scripts/benchmark.py | 67 ++++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 8dbe7038..66f5c21f 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -208,34 +208,50 @@ def print_output(timings, args):
     # Print header
     baseline = args.baseline['timings'] if args.baseline else {}
     if not baseline:
-        print('-' * 40)
+        print('-' * 45)
         print('{:<25s}{:>15s}'.format('plugin', 'time'))
-        print('-' * 40)
+        print('-' * 45)
     else:
-        print('-' * 57)
-        print('{:<25s}{:>13s}{:>16s}'.format('plugin', 'time', 'change'))
-        print('-' * 57)
+        print('-' * 60)
+        print('{:<25s}{:>11s}{:>22s}'.format('plugin', 'time', 'change'))
+        print('-' * 60)
 
     # Print content
     if 'all-plugins' in timings:
         print_line(
             'All Plugins',
-            timings['all-plugins'],
-            baseline.get('all-plugins'),
+            time=timings['all-plugins'],
+            baseline=_get_baseline_value(baseline, 'all-plugins'),
+            timeout=args.harakiri,
        )
         del timings['all-plugins']
 
     for key in sorted(timings):
-        print_line(key, timings[key], baseline.get(key))
+        print_line(
+            key,
+            time=timings[key],
+            baseline=_get_baseline_value(baseline, key),
+            timeout=args.harakiri,
+        )
 
     # Print footer line
     if not args.baseline:
-        print('-' * 40)
+        print('-' * 45)
     else:
-        print('-' * 57)
+        print('-' * 60)
+
+
+def _get_baseline_value(baseline, key):
+    """
+    We need to distinguish between no baseline mode (which should return
+    None as a value), baseline mode with exceeded timeout (which is stored
+    as None, but should return 0).
+    """
+    if key in baseline:
+        return 0 if baseline[key] is None else baseline[key]
 
 
-def print_line(name, time, baseline):
+def print_line(name, time, baseline, timeout):
     """
     :type name: str
 
@@ -244,38 +260,53 @@ def print_line(name, time, baseline):
     :type baseline: float
     :param baseline: expected seconds to execute
+
+    :type timeout: float
+    :param timeout: used to calculate difference when either current
+        execution or baseline execution exceeds timeout.
     """
     if not time:
         time_string = 'Timeout exceeded!'
     else:
         time_string = '{}s'.format(str(time))
 
-    if baseline:
-        difference = round(baseline - time, 2)
+    if baseline is not None:
+        if time and baseline:
+            difference = round(baseline - time, 2)
+        elif time:
+            # This handles the case when the baseline execution exceeds timeout
+            difference = round(timeout - time, 2)
+        elif baseline:
+            # This handles the case when this current execution exceeds timeout
+            difference = round(baseline - timeout, 2)
+        else:
+            # They both failed.
+            difference = 0
+
         if difference > 0:
             difference_string = colorize(
                 '▲ {}'.format(difference),
                 AnsiColor.LIGHT_GREEN,
             )
-            difference_string = '{:>24s}'.format(difference_string)
+            difference_string = '{:>22s}'.format(difference_string)
         elif difference < 0:
             difference_string = colorize(
                 '▼ {}'.format(difference),
                 AnsiColor.RED,
            )
-            difference_string = '{:>24s}'.format(difference_string)
+            difference_string = '{:>22s}'.format(difference_string)
         else:
-            difference_string = '{:>12s}'.format('-')
+            difference_string = '{:>10s}'.format('-')
 
         print(
-            '{:<25s}{:>15s}{}'.format(
+            '{:<25s}{:^20s}{}'.format(
                 name,
                 time_string,
                 difference_string,
             ),
        )
     else:
-        print('{:<25s}{:>15s}'.format(name, time_string))
+        print('{:<25s}{:>20s}'.format(name, time_string))
 
 
 if __name__ == '__main__':

From 662948803ab01996faa64e3519024faaaa421a86 Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Wed, 29 May 2019 21:11:44 -0700
Subject: [PATCH 4/4] adding performance test cases

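These baselines were produced by redirecting the output of the script
from the previous commit, along the lines of (flag spelling as defined
in run_performance_tests.py):

    python scripts/run_performance_tests.py -m LONG_LINES > test_data/performance/long-lines.test.json

When a file like this is fed back through --baseline, only its "config"
and "timings" blocks are consulted; the temporary file paths recorded
under "filenames" are an artifact of the generating run, since
comparison runs scan a freshly generated temp file instead.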
---
 test_data/performance/long-file.test.json  | 21 +++++++++++++++++++++
 test_data/performance/long-lines.test.json | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 test_data/performance/long-file.test.json
 create mode 100644 test_data/performance/long-lines.test.json

diff --git a/test_data/performance/long-file.test.json b/test_data/performance/long-file.test.json
new file mode 100644
index 00000000..6ee24f4f
--- /dev/null
+++ b/test_data/performance/long-file.test.json
@@ -0,0 +1,21 @@
+{
+    "config": {
+        "length": null,
+        "mode": "LONG_FILE"
+    },
+    "filenames": [
+        "/tmp/tmpxmbg9rgl"
+    ],
+    "timings": {
+        "AWSKeyDetector": 1.40741,
+        "ArtifactoryDetector": 1.89773,
+        "Base64HighEntropyString": 2.89092,
+        "BasicAuthDetector": 1.35538,
+        "HexHighEntropyString": 2.8728,
+        "KeywordDetector": 2.45626,
+        "PrivateKeyDetector": 1.85466,
+        "SlackDetector": 1.50198,
+        "StripeDetector": 1.40959,
+        "all-plugins": 16.10408
+    }
+}
diff --git a/test_data/performance/long-lines.test.json b/test_data/performance/long-lines.test.json
new file mode 100644
index 00000000..0642d920
--- /dev/null
+++ b/test_data/performance/long-lines.test.json
@@ -0,0 +1,21 @@
+{
+    "config": {
+        "length": null,
+        "mode": "LONG_LINES"
+    },
+    "filenames": [
+        "/tmp/tmp7iwzjvol"
+    ],
+    "timings": {
+        "AWSKeyDetector": 1.33812,
+        "ArtifactoryDetector": 1.95065,
+        "Base64HighEntropyString": 22.52337,
+        "BasicAuthDetector": 1.3325,
+        "HexHighEntropyString": 22.31473,
+        "KeywordDetector": 1.64396,
+        "PrivateKeyDetector": 1.35953,
+        "SlackDetector": 1.41018,
+        "StripeDetector": 1.49091,
+        "all-plugins": null
+    }
+}