From e1dedbc6df75aac863cd16d6cec4097a62930ff0 Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Tue, 28 May 2019 20:17:32 -0700
Subject: [PATCH 1/4] adding baseline functionality for benchmark script

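A rough usage sketch of the new flag (the baseline file name is
illustrative). A run without --pretty and without --baseline now dumps
JSON; feeding that JSON back in through --baseline prints the pretty
table with an extra "change" column, and reuses the filenames recorded
in the baseline when none are passed on the command line:

    python scripts/benchmark.py > baseline.json
    python scripts/benchmark.py --baseline baseline.json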
---
 scripts/benchmark.py | 141 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 112 insertions(+), 29 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 34b7bd6e..dbfc7a35 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -10,6 +10,8 @@
 
 from monotonic import monotonic
 
+from detect_secrets.core.color import AnsiColor
+from detect_secrets.core.color import colorize
 from detect_secrets.core.usage import PluginOptions
 
 
@@ -92,7 +94,7 @@ def get_arguments():
     parser.add_argument(
         '--harakiri',
         default=5,
-        type=float,
+        type=assert_positive(float),
         help=(
             'Specifies an upper bound for the number of seconds to wait '
             'per execution.'
         ),
     )
@@ -102,23 +104,35 @@ def get_arguments():
         '-n',
         '--num-iterations',
         default=1,
-        type=assert_positive_integer,
+        type=assert_positive(int),
         help=(
             'Specifies the number of times to run the test. '
             'Results will be averaged over this value.'
         ),
     )
+    parser.add_argument(
+        '--baseline',
+        type=assert_valid_file,
+        help=(
+            'If provided, will compare performance with provided baseline. '
+            'Assumes pretty output (otherwise, you can do the comparison '
+            'yourself).'
+        ),
+    )
 
     args = parser.parse_args()
     if not args.filenames:
-        args.filenames = [
-            os.path.realpath(
-                os.path.join(
-                    os.path.dirname(__file__),
-                    '../',
+        if args.baseline:
+            args.filenames = args.baseline['filenames']
+        else:
+            args.filenames = [
+                os.path.realpath(
+                    os.path.join(
+                        os.path.dirname(__file__),
+                        '../',
+                    ),
                 ),
-            ),
-        ]
+            ]
 
     if not args.plugin:
         args.plugin = plugins
@@ -126,16 +140,30 @@ def get_arguments():
     return args
 
 
-def assert_positive_integer(string):
-    value = int(string)
-    if value <= 0:
+def assert_positive(type):
+    def wrapped(string):
+        value = type(string)
+        if value <= 0:
+            raise argparse.ArgumentTypeError(
+                '{} must be a positive {}.'.format(
+                    string,
+                    type.__name__,
+                ),
+            )
+
+        return value
+
+    return wrapped
+
+
+def assert_valid_file(string):
+    if not os.path.isfile(string):
         raise argparse.ArgumentTypeError(
-            '{} must be a positive integer.'.format(
-                string,
-            ),
+            '{} must be a valid file.'.format(string),
         )
 
-    return value
+    with open(string) as f:
+        return json.load(f)
 
 
 def time_execution(filenames, timeout, num_iterations=1, flags=None):
@@ -166,7 +194,7 @@ def time_execution(filenames, timeout, num_iterations=1, flags=None):
         if result == timeout:
             return None
 
-    return statistics.mean(scores)
+    return round(statistics.mean(scores), 5)
 
 
 def print_output(timings, args):
@@ -174,31 +202,86 @@ def print_output(timings, args):
     :type timings: dict
     :type args: Namespace
     """
-    if not args.pretty:
-        print(json.dumps(timings))
+    if not args.pretty and not args.baseline:
+        print(
+            json.dumps({
+                'filenames': args.filenames,
+                'timings': timings,
+            }),
+        )
         return
 
     # Print header
-    print('-' * 42)
-    print('{:<20s}{:>20s}'.format('plugin', 'time'))
-    print('-' * 42)
+    baseline = args.baseline['timings'] if args.baseline else {}
+    if not baseline:
+        print('-' * 40)
+        print('{:<25s}{:>15s}'.format('plugin', 'time'))
+        print('-' * 40)
+    else:
+        print('-' * 57)
+        print('{:<25s}{:>13s}{:>16s}'.format('plugin', 'time', 'change'))
+        print('-' * 57)
 
+    # Print content
     if 'all-plugins' in timings:
-        print_line('all-plugins', timings['all-plugins'])
+        print_line(
+            'All Plugins',
+            timings['all-plugins'],
+            baseline.get('all-plugins'),
+        )
         del timings['all-plugins']
 
     for key in sorted(timings):
-        print_line(key, timings[key])
-    print('-' * 42)
+        print_line(key, timings[key], baseline.get(key))
+
+    # Print footer line
+    if not args.baseline:
+        print('-' * 40)
+    else:
+        print('-' * 57)
+
+
+def print_line(name, time, baseline):
+    """
+    :type name: str
+
+    :type time: float
+    :param time: seconds it took to execute
 
-def print_line(name, time):
+    :type baseline: float
+    :param baseline: expected seconds to execute
+    """
     if not time:
-        time = 'Timeout exceeded!'
+        time_string = 'Timeout exceeded!'
     else:
-        time = '{}s'.format(str(time))
+        time_string = '{}s'.format(str(time))
+
+    if baseline:
+        difference = round(baseline - time, 2)
+        if difference > 0:
+            difference_string = colorize(
+                '▲ {}'.format(difference),
+                AnsiColor.LIGHT_GREEN,
+            )
+            difference_string = '{:>24s}'.format(difference_string)
+        elif difference < 0:
+            difference_string = colorize(
+                '▼ {}'.format(difference),
+                AnsiColor.RED,
+            )
+            difference_string = '{:>24s}'.format(difference_string)
+        else:
+            difference_string = '{:>12s}'.format('-')
 
-    print('{:<20s}{:>20s}'.format(name, time))
+        print(
+            '{:<25s}{:>15s}{}'.format(
+                name,
+                time_string,
+                difference_string,
+            ),
+        )
+    else:
+        print('{:<25s}{:>15s}'.format(name, time_string))
 
 
 if __name__ == '__main__':

From 323df84db3bc05862596ea325f15953076f7af7b Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Tue, 28 May 2019 23:01:08 -0700
Subject: [PATCH 2/4] adding performance tests

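A rough sketch of the intended workflow (the output file name is
illustrative). The script writes its JSON results to stdout and its
progress messages to stderr, so the output can be redirected straight
into a baseline file and fed back in on a later run:

    python scripts/run_performance_tests.py -m LONG_FILE > baseline.json
    python scripts/run_performance_tests.py -m LONG_FILE --baseline baseline.json

When --baseline is supplied, the mode and length are read back from the
baseline's "config" block (argparse still requires -m, but its value is
overridden).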
---
 detect_secrets/util.py               |  10 ++
 scripts/benchmark.py                 |  10 +-
 scripts/run_performance_tests.py     | 241 +++++++++++++++++++++++++++
 test_data/performance/best-songs.txt |  52 ++++++
 4 files changed, 305 insertions(+), 8 deletions(-)
 create mode 100644 detect_secrets/util.py
 create mode 100644 scripts/run_performance_tests.py
 create mode 100644 test_data/performance/best-songs.txt

diff --git a/detect_secrets/util.py b/detect_secrets/util.py
new file mode 100644
index 00000000..7a135a42
--- /dev/null
+++ b/detect_secrets/util.py
@@ -0,0 +1,10 @@
+import os
+
+
+def get_root_directory():
+    return os.path.realpath(
+        os.path.join(
+            os.path.dirname(__file__),
+            '../',
+        ),
+    )
diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index dbfc7a35..8dbe7038 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -13,6 +13,7 @@
 from detect_secrets.core.color import AnsiColor
 from detect_secrets.core.color import colorize
 from detect_secrets.core.usage import PluginOptions
+from detect_secrets.util import get_root_directory
 
 
 def main():
@@ -125,14 +126,7 @@ def get_arguments():
         if args.baseline:
             args.filenames = args.baseline['filenames']
         else:
-            args.filenames = [
-                os.path.realpath(
-                    os.path.join(
-                        os.path.dirname(__file__),
-                        '../',
-                    ),
-                ),
-            ]
+            args.filenames = [get_root_directory()]
 
     if not args.plugin:
         args.plugin = plugins
diff --git a/scripts/run_performance_tests.py b/scripts/run_performance_tests.py
new file mode 100644
index 00000000..1c0f86d6
--- /dev/null
+++ b/scripts/run_performance_tests.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+from __future__ import print_function
+
+import argparse
+import json
+import os
+import random
+import subprocess
+import sys
+import tempfile
+from enum import Enum
+
+from detect_secrets.util import get_root_directory
+
+
+class TestCase(Enum):
+    LONG_FILE = 1
+    LONG_LINES = 2
+
+
+def main():
+    args = parse_args()
+
+    # Get data from baseline
+    if args.baseline:
+        config = args.baseline['config']
+        args.mode = config['mode']
+        args.length = config['length']
+
+    mode = None
+    for case in TestCase:
+        if case.name == args.mode:
+            mode = case
+            break
+
+    content = generate_test_content(
+        mode,
+        timeout=args.harakiri,
+        length=args.length,
+    )
+    output = scan_content(
+        content,
+        timeout=args.harakiri,
+        baseline=args.baseline,
+    )
+
+    if not args.baseline:
+        temp = json.loads(output)
+        temp['config'] = {
+            'mode': mode.name,
+            'length': args.length,
+        }
+
+        output = json.dumps(temp)
+
+    print(output)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--harakiri',
+        default=30,
+        type=assert_positive(float),
+        help=(
+            'Specifies an upper bound for number of seconds to wait for '
+            'each test.'
+        ),
+    )
+    parser.add_argument(
+        '--baseline',
+        type=assert_valid_file,
+        help=(
+            'Specifies test config file to run. If this is provided, '
+            'all config options will be referenced from this file.'
+        ),
+    )
+    parser.add_argument(
+        '-m',
+        '--mode',
+        choices=[
+            value.name
+            for value in TestCase
+        ],
+        required=True,
+        help='Specifies the type of content to generate.',
+    )
+    parser.add_argument(
+        '-L',
+        '--length',
+        type=assert_positive(int),
+        help='Length of test case content.',
+    )
+
+    return parser.parse_args()
+
+
+def assert_positive(type):
+    def wrapped(string):
+        value = type(string)
+        if value <= 0:
+            raise argparse.ArgumentTypeError(
+                '{} must be a positive {}.'.format(
+                    string,
+                    type.__name__,
+                ),
+            )
+
+        return value
+
+    return wrapped
+
+
+def assert_valid_file(string):
+    if not os.path.isfile(string):
+        raise argparse.ArgumentTypeError(
+            '{} must be a valid file.'.format(string),
+        )
+
+    with open(string) as f:
+        return json.load(f)
+
+
+def generate_test_content(mode, **kwargs):
+    """
+    :type mode: TestCase
+    :type length: int
+    :type timeout: float
+    """
+    if not kwargs['length']:
+        del kwargs['length']
+
+    print('Generating content...', file=sys.stderr)
+    if mode == TestCase.LONG_FILE:
+        return generate_long_file(**kwargs)
+    elif mode == TestCase.LONG_LINES:
+        return generate_long_lines(**kwargs)
+
+
+def scan_content(content, timeout, baseline=None):
+    """
+    :type content: str
+    :type timeout: float
+    :type baseline: dict|None
+    """
+    args = [
+        'python',
+        os.path.join(
+            get_root_directory(),
+            'scripts/benchmark.py',
+        ),
+        '--harakiri', str(timeout),
+    ]
+
+    with tempfile.NamedTemporaryFile('w') as f:
+        f.write(content)
+
+        print('Running checks...', file=sys.stderr)
+        if not baseline:
+            args.append(f.name)
+            return subprocess.check_output(
+                args,
+                stderr=subprocess.DEVNULL,
+            ).decode('utf-8')
+
+        with tempfile.NamedTemporaryFile('w') as b:
+            b.write(
+                json.dumps({
+                    'filenames': [f.name],
+                    'timings': baseline['timings'],
+                }),
+            )
+            b.seek(0)
+
+            args.append('--baseline')
+            args.append(b.name)
+
+            return subprocess.check_output(
+                args,
+                stderr=subprocess.DEVNULL,
+            ).decode('utf-8')
+
+
+def generate_long_file(length=250000, **kwargs):
+    return generate_content(
+        separator='\n',
+        length=length,
+    )
+
+
+def generate_long_lines(length=250000, **kwargs):
+    return generate_content(
+        separator=' ',
+        length=length,
+    )
+
+
+def generate_content(separator, length):
+    """
+    :type secret: str
+    :type separator: str
+    :type length: int
+    """
+    valid_secrets = {
+        'AWSKeyDetector': 'AKIATESTTESTTESTTEST',
+        'ArtifactoryDetector': ':AKCtestTESTte',
+        'Base64HighEntropyString': 'Y29uZ3JhdHVsYXRpb25zISB0aGlzIGlzIGEgaGlkZGVuIG1lc3NhZ2U=',
+        'BasicAuthDetector': 'http://username:password@example.com',
+        'HexHighEntropyString': '123456abcd',
+        'KeywordDetector': 'api_key = foobar',
+        'PrivateKeyDetector': 'BEGIN PRIVATE KEY',
+        'SlackDetector': 'xoxb-1-test',
+        'StripeDetector': 'rk_live_TESTtestTESTtestTESTtest',
+    }
+
+    with open(
+        os.path.join(
+            get_root_directory(),
+            'test_data/performance/best-songs.txt',
+        ),
+    ) as f:
+        source_material = f.read().splitlines()
+
+    indexes = {}
+    for key in valid_secrets:
+        index = random.randint(0, length - 1)
+        indexes[index] = key
+
+    content = []
+    for line_number in range(length):
+        if line_number in indexes:
+            content.append(valid_secrets[indexes[line_number]])
+        else:
+            random_line = random.randint(0, len(source_material) - 1)
+            content.append(source_material[random_line])
+
+    return separator.join(content)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/test_data/performance/best-songs.txt b/test_data/performance/best-songs.txt
new file mode 100644
index 00000000..bd77eb34
--- /dev/null
+++ b/test_data/performance/best-songs.txt
@@ -0,0 +1,52 @@
+Is this the real life?
+Is this just fantasy?
+Caught in a landslide
+No escape from reality
+Open your eyes
+Look up to the skies and see
+I'm just a poor boy, I need no sympathy
+Because I'm easy come, easy go
+A little high, little low
+Anyway the wind blows, doesn't really matter to me, to me
+Mama, just killed a man
+Put a gun against his head
+Pulled my trigger, now he's dead
+Mama, life had just begun
+But now I've gone and thrown it all away
+Mama, oh oh
+Didn't mean to make you cry
+If I'm not back again this time tomorrow
+Carry on, carry on, as if nothing really matters
+Too late, my time has come
+Sends shivers down my spine
+Body's aching all the time
+Goodbye everybody I've got to go
+Gotta leave you all behind and face the truth
+Mama, oh oh (anyway the wind blows)
+I don't want to die
+Sometimes wish I'd never been born at all
+I see a little silhouetto of a man
+Scaramouch, Scaramouch will you do the Fandango
+Thunderbolt and lightning very very frightening me
+Gallileo, Gallileo, Gallileo, Gallileo, Gallileo, figaro, magnifico
+I'm just a poor boy and nobody loves me
+He's just a poor boy from a poor family
+Spare him his life from this monstrosity
+Easy come easy go will you let me go
+Bismillah, no we will not let you go, let him go
+Bismillah, we will not let you go, let him go
+Bismillah, we will not let you go, let me go
+(Will not let you go) let me go (never, never let you go) let me go (never let me go)
+Oh oh no, no, no, no, no, no, no
+Oh mama mia, mama mia, mama mia let me go
+Beelzebub has a devil put aside for me for me for me
+So you think you can stop me and spit in my eye
+So you think you can love me and leave me to die
+Oh baby can't do this to me baby
+Just gotta get out just gotta get right outta here
+Oh oh oh yeah, oh oh yeah
+Nothing really matters
+Anyone can see
+Nothing really matters
+Nothing really matters to me
+Anyway the wind blows

From 1e29d5ffa028d096367c74774c4cc73c34bbfd51 Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Wed, 29 May 2019 21:01:00 -0700
Subject: [PATCH 3/4] bug fix: handling baseline output when timeout was exceeded for benchmark script

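For context: a baseline records a plugin's timing as null whenever that
run exceeded --harakiri, as in this excerpt from the long-lines baseline
added later in this series:

    "timings": {
        "Base64HighEntropyString": 22.52337,
        "all-plugins": null
    }

Previously print_line assumed both numbers were present. It now falls
back to the --harakiri value for whichever side timed out, and prints
'-' when both runs timed out.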
---
 scripts/benchmark.py | 67 ++++++++++++++++++++++++++++++++------------
 1 file changed, 49 insertions(+), 18 deletions(-)

diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 8dbe7038..66f5c21f 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -208,34 +208,50 @@ def print_output(timings, args):
     # Print header
     baseline = args.baseline['timings'] if args.baseline else {}
     if not baseline:
-        print('-' * 40)
+        print('-' * 45)
         print('{:<25s}{:>15s}'.format('plugin', 'time'))
-        print('-' * 40)
+        print('-' * 45)
     else:
-        print('-' * 57)
-        print('{:<25s}{:>13s}{:>16s}'.format('plugin', 'time', 'change'))
-        print('-' * 57)
+        print('-' * 60)
+        print('{:<25s}{:>11s}{:>22s}'.format('plugin', 'time', 'change'))
+        print('-' * 60)
 
     # Print content
     if 'all-plugins' in timings:
         print_line(
             'All Plugins',
-            timings['all-plugins'],
-            baseline.get('all-plugins'),
+            time=timings['all-plugins'],
+            baseline=_get_baseline_value(baseline, 'all-plugins'),
+            timeout=args.harakiri,
        )
         del timings['all-plugins']
 
     for key in sorted(timings):
-        print_line(key, timings[key], baseline.get(key))
+        print_line(
+            key,
+            time=timings[key],
+            baseline=_get_baseline_value(baseline, key),
+            timeout=args.harakiri,
+        )
 
     # Print footer line
     if not args.baseline:
-        print('-' * 40)
+        print('-' * 45)
     else:
-        print('-' * 57)
+        print('-' * 60)
+
+
+def _get_baseline_value(baseline, key):
+    """
+    We need to distinguish between no baseline mode (which should return
+    None as a value), baseline mode with exceeded timeout (which is stored
+    as None, but should return 0).
+    """
+    if key in baseline:
+        return 0 if baseline[key] is None else baseline[key]
 
 
-def print_line(name, time, baseline):
+def print_line(name, time, baseline, timeout):
     """
     :type name: str
 
@@ -244,38 +260,53 @@ def print_line(name, time, baseline):
     :type baseline: float
     :param baseline: expected seconds to execute
+
+    :type timeout: float
+    :param timeout: used to calculate difference when either current
+        execution or baseline execution exceeds timeout.
     """
     if not time:
         time_string = 'Timeout exceeded!'
     else:
         time_string = '{}s'.format(str(time))
 
-    if baseline:
-        difference = round(baseline - time, 2)
+    if baseline is not None:
+        if time and baseline:
+            difference = round(baseline - time, 2)
+        elif time:
+            # This handles the case when the baseline execution exceeds timeout
+            difference = round(timeout - time, 2)
+        elif baseline:
+            # This handles the case when this current execution exceeds timeout
+            difference = round(baseline - timeout, 2)
+        else:
+            # They both failed.
+            difference = 0
+
         if difference > 0:
             difference_string = colorize(
                 '▲ {}'.format(difference),
                 AnsiColor.LIGHT_GREEN,
             )
-            difference_string = '{:>24s}'.format(difference_string)
+            difference_string = '{:>22s}'.format(difference_string)
         elif difference < 0:
             difference_string = colorize(
                 '▼ {}'.format(difference),
                 AnsiColor.RED,
            )
-            difference_string = '{:>24s}'.format(difference_string)
+            difference_string = '{:>22s}'.format(difference_string)
         else:
-            difference_string = '{:>12s}'.format('-')
+            difference_string = '{:>10s}'.format('-')
 
         print(
-            '{:<25s}{:>15s}{}'.format(
+            '{:<25s}{:^20s}{}'.format(
                 name,
                 time_string,
                 difference_string,
             ),
        )
     else:
-        print('{:<25s}{:>15s}'.format(name, time_string))
+        print('{:<25s}{:>20s}'.format(name, time_string))
 
 
 if __name__ == '__main__':

From 662948803ab01996faa64e3519024faaaa421a86 Mon Sep 17 00:00:00 2001
From: Aaron Loo
Date: Wed, 29 May 2019 21:11:44 -0700
Subject: [PATCH 4/4] adding performance test cases

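These baselines were produced by redirecting the output of the script
from the previous commit, along the lines of (flag spelling as defined
in run_performance_tests.py):

    python scripts/run_performance_tests.py -m LONG_LINES > test_data/performance/long-lines.test.json

When a file like this is fed back through --baseline, only its "config"
and "timings" blocks are consulted; the temporary file paths recorded
under "filenames" are an artifact of the generating run, since
comparison runs scan a freshly generated temp file instead.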
---
 test_data/performance/long-file.test.json  | 21 +++++++++++++++++++++
 test_data/performance/long-lines.test.json | 21 +++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 test_data/performance/long-file.test.json
 create mode 100644 test_data/performance/long-lines.test.json

diff --git a/test_data/performance/long-file.test.json b/test_data/performance/long-file.test.json
new file mode 100644
index 00000000..6ee24f4f
--- /dev/null
+++ b/test_data/performance/long-file.test.json
@@ -0,0 +1,21 @@
+{
+    "config": {
+        "length": null,
+        "mode": "LONG_FILE"
+    },
+    "filenames": [
+        "/tmp/tmpxmbg9rgl"
+    ],
+    "timings": {
+        "AWSKeyDetector": 1.40741,
+        "ArtifactoryDetector": 1.89773,
+        "Base64HighEntropyString": 2.89092,
+        "BasicAuthDetector": 1.35538,
+        "HexHighEntropyString": 2.8728,
+        "KeywordDetector": 2.45626,
+        "PrivateKeyDetector": 1.85466,
+        "SlackDetector": 1.50198,
+        "StripeDetector": 1.40959,
+        "all-plugins": 16.10408
+    }
+}
diff --git a/test_data/performance/long-lines.test.json b/test_data/performance/long-lines.test.json
new file mode 100644
index 00000000..0642d920
--- /dev/null
+++ b/test_data/performance/long-lines.test.json
@@ -0,0 +1,21 @@
+{
+    "config": {
+        "length": null,
+        "mode": "LONG_LINES"
+    },
+    "filenames": [
+        "/tmp/tmp7iwzjvol"
+    ],
+    "timings": {
+        "AWSKeyDetector": 1.33812,
+        "ArtifactoryDetector": 1.95065,
+        "Base64HighEntropyString": 22.52337,
+        "BasicAuthDetector": 1.3325,
+        "HexHighEntropyString": 22.31473,
+        "KeywordDetector": 1.64396,
+        "PrivateKeyDetector": 1.35953,
+        "SlackDetector": 1.41018,
+        "StripeDetector": 1.49091,
+        "all-plugins": null
+    }
+}