From d93e9bd6e35911a570a7ad22c7175dc4ef491480 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 19:18:35 -0700 Subject: [PATCH 01/26] :mortar_board: Standardize `do not` -> `don't` --- CHANGELOG.md | 4 ++-- LICENSE | 6 +++--- README.md | 2 +- detect_secrets/plugins/basic_auth.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c254827b..3687cee1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -275,7 +275,7 @@ This includes using `# pragma: allowlist secret` now for inline allowlisting. #### :telescope: Accuracy -- [Added `null` to the `FALSE_POSITIVES` tuple for the `KeywordDetector` plugin, so we do not alert off of it](https://github.com/Yelp/detect-secrets/commit/58df82ce37d64f22cb885960c2031b5f8ebe4b75) +- [Added `null` to the `FALSE_POSITIVES` tuple for the `KeywordDetector` plugin, so we don't alert off of it](https://github.com/Yelp/detect-secrets/commit/58df82ce37d64f22cb885960c2031b5f8ebe4b75) @@ -286,7 +286,7 @@ This includes using `# pragma: allowlist secret` now for inline allowlisting. - Turned the `KeywordDetector` plugin back on, with new regexes and accuracy improvements ([#86]) - Added an `AWSAccessKeyDetector` plugin ([#100]) -- Added the ability to scan `.ini` types files that do not have a header ([#106]) +- Added the ability to scan `.ini` types files that don't have a header ([#106]) [#86]: https://github.com/Yelp/detect-secrets/pull/86 [#100]: https://github.com/Yelp/detect-secrets/pull/100 diff --git a/LICENSE b/LICENSE index a6962b3b..6db569d4 100644 --- a/LICENSE +++ b/LICENSE @@ -100,13 +100,13 @@ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of + excluding those notices that don't pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not + within such NOTICE file, excluding those notices that don't pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or @@ -114,7 +114,7 @@ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution + don't modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed diff --git a/README.md b/README.md index 785731ef..d9a9e0e2 100644 --- a/README.md +++ b/README.md @@ -178,7 +178,7 @@ committing secrets. ### Things that won't be prevented * Multi-line secrets -* Default passwords that do not trigger the `KeywordDetector` (e.g. `login = "hunter2"`) +* Default passwords that don't trigger the `KeywordDetector` (e.g. `login = "hunter2"`) ### Plugin Configuration diff --git a/detect_secrets/plugins/basic_auth.py b/detect_secrets/plugins/basic_auth.py index 020f131e..29a33b15 100644 --- a/detect_secrets/plugins/basic_auth.py +++ b/detect_secrets/plugins/basic_auth.py @@ -7,7 +7,7 @@ # This list is derived from RFC 3986 Section 2.2. # -# We do not expect any of these delimiter characters to appear in +# We don't expect any of these delimiter characters to appear in # the username/password component of the URL, seeing that this would probably # result in an unexpected URL parsing (and probably won't even work). RESERVED_CHARACTERS = ':/?#[]@' From 5b66a26d050f410fddb2c8532e4a564d7c879cac Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 19:19:19 -0700 Subject: [PATCH 02/26] :snake: Refactor awkward [0] to a short-circuit In get_raw_secret_value() --- detect_secrets/core/audit.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/detect_secrets/core/audit.py b/detect_secrets/core/audit.py index 949bc343..f3ad5b27 100644 --- a/detect_secrets/core/audit.py +++ b/detect_secrets/core/audit.py @@ -657,13 +657,9 @@ def get_raw_secret_value( plugin_secrets = plugin.analyze(file_handle, filename) - matching_secret = [ - plugin_secret.secret_value - for plugin_secret in plugin_secrets - if plugin_secret.secret_hash == secret['hashed_secret'] - ] - - if not matching_secret: - raise SecretNotFoundOnSpecifiedLineError(secret['line_number']) + # Return value of matching secret + for plugin_secret in plugin_secrets: + if plugin_secret.secret_hash == secret['hashed_secret']: + return plugin_secret.secret_value - return matching_secret[0] + raise SecretNotFoundOnSpecifiedLineError(secret['line_number']) From bb543c5b20372f507ae0f99f7d01872f66db3a83 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 19:38:08 -0700 Subject: [PATCH 03/26] :bug: We were doing e.g. '.svg' == 'svg' `os.path.splitext(filename)[1]` includes the '.' --- detect_secrets/core/constants.py | 51 +++++++++++++++------------ tests/core/secrets_collection_test.py | 8 +++++ 2 files changed, 36 insertions(+), 23 deletions(-) diff --git a/detect_secrets/core/constants.py b/detect_secrets/core/constants.py index fe59c77d..2fb5ca6a 100644 --- a/detect_secrets/core/constants.py +++ b/detect_secrets/core/constants.py @@ -7,29 +7,34 @@ # and look for "ASCII text", but that might be more expensive. # # Definitely something to look into, if this list gets unruly long. -IGNORED_FILE_EXTENSIONS = { - '7z', - 'bmp', - 'bz2', - 'dmg', - 'exe', - 'gif', - 'gz', - 'ico', - 'jar', - 'jpg', - 'jpeg', - 'png', - 'rar', - 'realm', - 's7z', - 'svg', - 'tar', - 'tif', - 'tiff', - 'webp', - 'zip', -} +IGNORED_FILE_EXTENSIONS = set( + ( + '.' + extension + for extension in ( + '7z', + 'bmp', + 'bz2', + 'dmg', + 'exe', + 'gif', + 'gz', + 'ico', + 'jar', + 'jpg', + 'jpeg', + 'png', + 'rar', + 'realm', + 's7z', + 'svg', + 'tar', + 'tif', + 'tiff', + 'webp', + 'zip', + ) + ) +) class VerifiedResult(Enum): diff --git a/tests/core/secrets_collection_test.py b/tests/core/secrets_collection_test.py index 3cf5cdd5..b43e6e3b 100644 --- a/tests/core/secrets_collection_test.py +++ b/tests/core/secrets_collection_test.py @@ -54,6 +54,14 @@ def test_file_is_symbolic_link(self): assert not logic.scan_file('does_not_matter') + def test_skip_ignored_file_extensions(self): + logic = secrets_collection_factory( + plugins=(MockPluginFixedValue(),), + ) + with mock_open('junk text here, as it does not matter'): + skipped_extension = '.svg' + assert not logic.scan_file('some' + skipped_extension) + def test_error_reading_file(self, mock_log): logic = secrets_collection_factory() From f10cce89321f27db98618fbbe52fdd43ef49adf2 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 19:44:02 -0700 Subject: [PATCH 04/26] :snake: Refactor `for...if...return` with `any` --- detect_secrets/plugins/common/filters.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/detect_secrets/plugins/common/filters.py b/detect_secrets/plugins/common/filters.py index 3a5e079d..a6c02b2f 100644 --- a/detect_secrets/plugins/common/filters.py +++ b/detect_secrets/plugins/common/filters.py @@ -7,13 +7,13 @@ def is_false_positive(secret): - for func in [ - is_sequential_string, - ]: - if func(secret): - return True - - return False + return any( + func(secret) + for func in + ( + is_sequential_string, + ) + ) def is_sequential_string(secret): From bc4f922d2084da4d715f23206bec39f7794a8ebf Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 21:23:52 -0700 Subject: [PATCH 05/26] :snake: Remove is_false_positive from RegexBasedDetector The current heuristic will never return True --- detect_secrets/plugins/base.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py index 22f0ad18..f3e0005c 100644 --- a/detect_secrets/plugins/base.py +++ b/detect_secrets/plugins/base.py @@ -4,7 +4,6 @@ from abc import abstractproperty from .common.constants import ALLOWLIST_REGEXES -from .common.filters import is_false_positive from detect_secrets.core.code_snippet import CodeSnippetHighlighter from detect_secrets.core.constants import VerifiedResult from detect_secrets.core.potential_secret import PotentialSecret @@ -235,7 +234,4 @@ def analyze_string_content(self, string, line_num, filename): def secret_generator(self, string, *args, **kwargs): for regex in self.denylist: for match in regex.findall(string): - if is_false_positive(match): - continue - yield match From 0b2c0e119c43361a31ef343b8b0587cadddc3584 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 21:25:26 -0700 Subject: [PATCH 06/26] :mortar_board: Standardize comments Start capitalized and after 2 spaces when on the same line as code --- detect_secrets/core/bidirectional_iterator.py | 2 +- detect_secrets/plugins/artifactory.py | 8 ++++---- detect_secrets/plugins/aws.py | 2 +- detect_secrets/plugins/common/constants.py | 4 ++-- detect_secrets/plugins/common/ini_file_parser.py | 12 ++++++------ detect_secrets/plugins/stripe.py | 2 +- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/detect_secrets/core/bidirectional_iterator.py b/detect_secrets/core/bidirectional_iterator.py index f3f573bc..b664a99c 100644 --- a/detect_secrets/core/bidirectional_iterator.py +++ b/detect_secrets/core/bidirectional_iterator.py @@ -1,7 +1,7 @@ class BidirectionalIterator(object): def __init__(self, collection): self.collection = collection - self.index = -1 # starts on -1, as index is increased _before_ getting result + self.index = -1 # Starts on -1, as index is increased _before_ getting result self.step_back_once = False def __next__(self): diff --git a/detect_secrets/plugins/artifactory.py b/detect_secrets/plugins/artifactory.py index b0a72870..bada92b9 100644 --- a/detect_secrets/plugins/artifactory.py +++ b/detect_secrets/plugins/artifactory.py @@ -10,8 +10,8 @@ class ArtifactoryDetector(RegexBasedDetector): secret_type = 'Artifactory Credentials' denylist = [ - # artifactory tokens begin with AKC - re.compile(r'(?:\s|=|:|"|^)AKC[a-zA-Z0-9]{10,}'), # api token - # artifactory encrypted passwords begin with AP[A-Z] - re.compile(r'(?:\s|=|:|"|^)AP[\dABCDEF][a-zA-Z0-9]{8,}'), # password + # Artifactory tokens begin with AKC + re.compile(r'(?:\s|=|:|"|^)AKC[a-zA-Z0-9]{10,}'), # API token + # Artifactory encrypted passwords begin with AP[A-Z] + re.compile(r'(?:\s|=|:|"|^)AP[\dABCDEF][a-zA-Z0-9]{8,}'), # Password ] diff --git a/detect_secrets/plugins/aws.py b/detect_secrets/plugins/aws.py index 84c94051..f99d1d06 100644 --- a/detect_secrets/plugins/aws.py +++ b/detect_secrets/plugins/aws.py @@ -176,7 +176,7 @@ def verify_aws_secret_access_key(key, secret): # pragma: no cover return True -def _sign(key, message, hex=False): # pragma: no cover +def _sign(key, message, hex=False): # pragma: no cover value = hmac.new(key, message.encode('utf-8'), hashlib.sha256) if not hex: return value.digest() diff --git a/detect_secrets/plugins/common/constants.py b/detect_secrets/plugins/common/constants.py index d96aacdf..9efc10e2 100644 --- a/detect_secrets/plugins/common/constants.py +++ b/detect_secrets/plugins/common/constants.py @@ -23,9 +23,9 @@ ] ] -# add to this mapping (and ALLOWLIST_REGEXES if applicable) lazily, +# Add to this mapping (and ALLOWLIST_REGEXES if applicable) lazily, # as more language specific file parsers are implemented. -# discussion: https://github.com/Yelp/detect-secrets/pull/105 +# Discussion: https://github.com/Yelp/detect-secrets/pull/105 ALLOWLIST_REGEX = { 'yaml': ALLOWLIST_REGEXES[0], } diff --git a/detect_secrets/plugins/common/ini_file_parser.py b/detect_secrets/plugins/common/ini_file_parser.py index e4274459..6c828cee 100644 --- a/detect_secrets/plugins/common/ini_file_parser.py +++ b/detect_secrets/plugins/common/ini_file_parser.py @@ -38,10 +38,10 @@ def __init__(self, file, add_header=False, exclude_lines_regex=None): """ self.parser = configparser.ConfigParser() try: - # python2.7 compatible + # Python2.7 compatible self.parser.optionxform = unicode except NameError: # pragma: no cover - # python3 compatible + # Python3 compatible self.parser.optionxform = str self.exclude_lines_regex = exclude_lines_regex @@ -53,10 +53,10 @@ def __init__(self, file, add_header=False, exclude_lines_regex=None): content = '[global]\n' + content try: - # python2.7 compatible + # Python2.7 compatible self.parser.read_string(unicode(content)) except NameError: # pragma: no cover - # python3 compatible + # Python3 compatible self.parser.read_string(content) # Hacky way to keep track of line location @@ -135,7 +135,7 @@ def _get_value_and_line_offset(self, key, values): if current_value_list_index == len(values_list): if index == 0: - index = 1 # don't want to count the same line again + index = 1 # Don't want to count the same line again self.line_offset += index self.lines = self.lines[index:] lines_modified = True @@ -163,7 +163,7 @@ def _construct_values_list(values): >>> key = value0 ... value1 ... - ... # comment line here + ... # Comment line here ... value2 given that normally, either value0 is supplied, or (value1, value2), diff --git a/detect_secrets/plugins/stripe.py b/detect_secrets/plugins/stripe.py index 9eda09a2..2c94420f 100644 --- a/detect_secrets/plugins/stripe.py +++ b/detect_secrets/plugins/stripe.py @@ -13,6 +13,6 @@ class StripeDetector(RegexBasedDetector): secret_type = 'Stripe Access Key' denylist = ( - # stripe standard keys begin with sk_live and restricted with rk_live + # Stripe standard keys begin with sk_live and restricted with rk_live re.compile(r'(?:r|s)k_live_[0-9a-zA-Z]{24}'), ) From f8cb31fd4d4cd8a5b1e1a5f2846961ded80553b5 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 21:47:26 -0700 Subject: [PATCH 07/26] :tada: Add --word-list option - Add `pyahocorasick` as an optional dependency See issue #240 for more information. --- detect_secrets/core/baseline.py | 14 ++++- detect_secrets/core/secrets_collection.py | 29 +++++++++- detect_secrets/core/usage.py | 22 ++++++- detect_secrets/main.py | 38 +++++++++++-- detect_secrets/plugins/base.py | 33 ++++++++++- detect_secrets/plugins/common/filters.py | 57 ++++++++++++++++--- detect_secrets/plugins/common/initialize.py | 33 +++++++++-- .../plugins/high_entropy_strings.py | 42 ++++++-------- detect_secrets/plugins/keyword.py | 13 +++-- detect_secrets/pre_commit_hook.py | 12 +++- detect_secrets/util.py | 49 +++++++++++++++- requirements-dev.txt | 1 + setup.py | 3 + test_data/word_list.txt | 1 + testing/factories.py | 19 ++++++- tests/core/secrets_collection_test.py | 57 +++++++++++++++++-- tests/main_test.py | 8 +++ tests/plugins/common/filters_test.py | 10 ++-- tests/plugins/high_entropy_strings_test.py | 2 + tests/plugins/keyword_test.py | 24 ++++++++ tests/pre_commit_hook_test.py | 11 +++- tests/util_test.py | 23 ++++++++ 22 files changed, 425 insertions(+), 76 deletions(-) create mode 100644 test_data/word_list.txt diff --git a/detect_secrets/core/baseline.py b/detect_secrets/core/baseline.py index bff8da11..6e573ac9 100644 --- a/detect_secrets/core/baseline.py +++ b/detect_secrets/core/baseline.py @@ -18,17 +18,27 @@ def initialize( plugins, exclude_files_regex=None, exclude_lines_regex=None, + word_list_file=None, + word_list_hash=None, should_scan_all_files=False, ): """Scans the entire codebase for secrets, and returns a SecretsCollection object. + :type path: list + :type plugins: tuple of detect_secrets.plugins.base.BasePlugin :param plugins: rules to initialize the SecretsCollection with. :type exclude_files_regex: str|None :type exclude_lines_regex: str|None - :type path: list + + :type word_list_file: str|None + :param word_list_file: optional word list file for ignoring certain words. + + :type word_list_hash: str|None + :param word_list_hash: optional iterated sha1 hash of the words in the word list. + :type should_scan_all_files: bool :rtype: SecretsCollection @@ -37,6 +47,8 @@ def initialize( plugins, exclude_files=exclude_files_regex, exclude_lines=exclude_lines_regex, + word_list_file=word_list_file, + word_list_hash=word_list_hash, ) files_to_scan = [] diff --git a/detect_secrets/core/secrets_collection.py b/detect_secrets/core/secrets_collection.py index 9266dd15..8ef68ae2 100644 --- a/detect_secrets/core/secrets_collection.py +++ b/detect_secrets/core/secrets_collection.py @@ -12,6 +12,7 @@ from detect_secrets.core.log import log from detect_secrets.core.potential_secret import PotentialSecret from detect_secrets.plugins.common import initialize +from detect_secrets.util import build_automaton class SecretsCollection(object): @@ -21,6 +22,8 @@ def __init__( plugins=(), exclude_files=None, exclude_lines=None, + word_list_file=None, + word_list_hash=None, ): """ :type plugins: tuple of detect_secrets.plugins.base.BasePlugin @@ -32,14 +35,18 @@ def __init__( :type exclude_lines: str|None :param exclude_lines: optional regex for ignored lines. - :type version: str - :param version: version of detect-secrets that SecretsCollection - is valid at. + :type word_list_file: str|None + :param word_list_file: optional word list file for ignoring certain words. + + :type word_list_hash: str|None + :param word_list_hash: optional iterated sha1 hash of the words in the word list. """ self.data = {} self.plugins = plugins self.exclude_files = exclude_files self.exclude_lines = exclude_lines + self.word_list_file = word_list_file + self.word_list_hash = word_list_hash self.version = VERSION @classmethod @@ -93,6 +100,17 @@ def load_baseline_from_dict(cls, data): result.exclude_files = data['exclude']['files'] result.exclude_lines = data['exclude']['lines'] + # In v0.12.7 the `--word-list` option got added + automaton = None + if 'word_list' in data: + result.word_list_file = data['word_list']['file'] + result.word_list_hash = data['word_list']['hash'] + + if result.word_list_file: + # Always ignore the given `data['word_list']['hash']` + # The difference will show whenever the word list changes + automaton, result.word_list_hash = build_automaton(result.word_list_file) + plugins = [] for plugin in data['plugins_used']: plugin_classname = plugin.pop('name') @@ -100,6 +118,7 @@ def load_baseline_from_dict(cls, data): initialize.from_plugin_classname( plugin_classname, exclude_lines_regex=result.exclude_lines, + automaton=automaton, should_verify_secrets=False, **plugin ), @@ -277,6 +296,10 @@ def format_for_baseline_output(self): 'files': self.exclude_files, 'lines': self.exclude_lines, }, + 'word_list': { + 'file': self.word_list_file, + 'hash': self.word_list_hash, + }, 'plugins_used': plugins_used, 'results': results, 'version': self.version, diff --git a/detect_secrets/core/usage.py b/detect_secrets/core/usage.py index c66b2cf0..3e933d7a 100644 --- a/detect_secrets/core/usage.py +++ b/detect_secrets/core/usage.py @@ -14,6 +14,18 @@ def add_exclude_lines_argument(parser): ) +def add_word_list_argument(parser): + parser.add_argument( + '--word-list', + type=str, + help=( + 'Text file with a list of words, ' + 'if a secret contains a word in the list we ignore it.' + ), + dest='word_list_file', + ) + + def add_use_all_plugins_argument(parser): parser.add_argument( '--use-all-plugins', @@ -46,6 +58,7 @@ def add_pre_commit_arguments(self): self._add_filenames_argument()\ ._add_set_baseline_argument()\ ._add_exclude_lines_argument()\ + ._add_word_list_argument()\ ._add_use_all_plugins_argument()\ ._add_no_verify_flag() @@ -108,6 +121,10 @@ def _add_exclude_lines_argument(self): add_exclude_lines_argument(self.parser) return self + def _add_word_list_argument(self): + add_word_list_argument(self.parser) + return self + def _add_use_all_plugins_argument(self): add_use_all_plugins_argument(self.parser) return self @@ -143,9 +160,10 @@ def _add_initialize_baseline_argument(self): ), ) - # Pairing `--exclude-lines` to both pre-commit and `--scan` - # because it can be used for both. + # Pairing `--exclude-lines` and `--word-list` to + # both pre-commit and `--scan` because it can be used for both. add_exclude_lines_argument(self.parser) + add_word_list_argument(self.parser) # Pairing `--exclude-files` with `--scan` because it's only used for the initialization. # The pre-commit hook framework already has an `exclude` option that can diff --git a/detect_secrets/main.py b/detect_secrets/main.py index f6f0d596..c939bfd2 100644 --- a/detect_secrets/main.py +++ b/detect_secrets/main.py @@ -13,6 +13,7 @@ from detect_secrets.core.secrets_collection import SecretsCollection from detect_secrets.core.usage import ParserBuilder from detect_secrets.plugins.common import initialize +from detect_secrets.util import build_automaton def parse_args(argv): @@ -30,11 +31,17 @@ def main(argv=None): log.set_debug_level(args.verbose) if args.action == 'scan': + automaton = None + word_list_hash = None + if args.word_list_file: + automaton, word_list_hash = build_automaton(args.word_list_file) + # Plugins are *always* rescanned with fresh settings, because # we want to get the latest updates. plugins = initialize.from_parser_builder( args.plugins, exclude_lines_regex=args.exclude_lines, + automaton=automaton, should_verify_secrets=not args.no_verify, ) if args.string: @@ -46,7 +53,12 @@ def main(argv=None): _scan_string(line, plugins) else: - baseline_dict = _perform_scan(args, plugins) + baseline_dict = _perform_scan( + args, + plugins, + automaton, + word_list_hash, + ) if args.import_filename: write_baseline_to_file( @@ -87,7 +99,7 @@ def main(argv=None): return 0 -def _get_plugin_from_baseline(old_baseline): +def _get_plugins_from_baseline(old_baseline): plugins = [] if old_baseline and 'plugins_used' in old_baseline: secrets_collection = SecretsCollection.load_baseline_from_dict(old_baseline) @@ -114,17 +126,25 @@ def _scan_string(line, plugins): print('\n'.join(sorted(output))) -def _perform_scan(args, plugins): +def _perform_scan(args, plugins, automaton, word_list_hash): """ :param args: output of `argparse.ArgumentParser.parse_args` :param plugins: tuple of initialized plugins + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + :type word_list_hash: str|None + :param word_list_hash: optional iterated sha1 hash of the words in the word list. + :rtype: dict """ old_baseline = _get_existing_baseline(args.import_filename) if old_baseline: - plugins = initialize.merge_plugin_from_baseline( - _get_plugin_from_baseline(old_baseline), args, + plugins = initialize.merge_plugins_from_baseline( + _get_plugins_from_baseline(old_baseline), + args, + automaton=automaton, ) # Favors `--exclude-files` and `--exclude-lines` CLI arguments @@ -139,6 +159,12 @@ def _perform_scan(args, plugins): ): args.exclude_lines = old_baseline['exclude']['lines'] + if ( + not args.word_list_file + and old_baseline.get('word_list') + ): + args.word_list_file = old_baseline['word_list']['file'] + # If we have knowledge of an existing baseline file, we should use # that knowledge and add it to our exclude_files regex. if args.import_filename: @@ -148,6 +174,8 @@ def _perform_scan(args, plugins): plugins=plugins, exclude_files_regex=args.exclude_files, exclude_lines_regex=args.exclude_lines, + word_list_file=args.word_list_file, + word_list_hash=word_list_hash, path=args.path, should_scan_all_files=args.all_files, ).format_for_baseline_output() diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py index f3e0005c..d4695fec 100644 --- a/detect_secrets/plugins/base.py +++ b/detect_secrets/plugins/base.py @@ -148,7 +148,11 @@ def adhoc_scan(self, string): : """ # TODO: Handle multiple secrets on single line. - results = self.analyze_string(string, 0, 'does_not_matter') + results = self.analyze_string( + string, + line_num=0, + filename='does_not_matter', + ) if not results: return 'False' @@ -193,7 +197,7 @@ def __dict__(self): class RegexBasedDetector(BasePlugin): - """Base class for regular-expression based detectors. + """Parent class for regular-expression based detectors. To create a new regex-based detector, subclass this and set `secret_type` with a description and `denylist` @@ -235,3 +239,28 @@ def secret_generator(self, string, *args, **kwargs): for regex in self.denylist: for match in regex.findall(string): yield match + + +class WordListSupportedDetector(BasePlugin): + """Parent class for detectors supporting a word list. + + To create a new word list supported detector, subclass this + and pass `automaton` in __init__ like: + + class BarDetector(WordListSupportedDetector): + + secret_type = "bar" + + def __init__(self, automaton=None, **kwargs): + super(BarDetector, self).__init__( + automaton, + **kwargs + ) + ... + """ + __metaclass__ = ABCMeta + + def __init__(self, automaton=None, **kwargs): + super(WordListSupportedDetector, self).__init__(**kwargs) + + self.automaton = automaton diff --git a/detect_secrets/plugins/common/filters.py b/detect_secrets/plugins/common/filters.py index a6c02b2f..e0fd0ebf 100644 --- a/detect_secrets/plugins/common/filters.py +++ b/detect_secrets/plugins/common/filters.py @@ -1,27 +1,66 @@ """ -Heuristic, false positive filters that are shared across all plugin types. +False positive heuristic filters that are shared across all plugin types. This abstraction allows for development of later ML work, or further heuristical determinations (e.g. word filter, entropy comparator). """ import string +from detect_secrets.util import is_python_2 -def is_false_positive(secret): + +def is_false_positive(secret, automaton): + """ + :type secret: str + + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + :rtype: bool + Returns True if any false positive heuristic function returns True. + """ return any( - func(secret) + func(secret, automaton) for func in ( - is_sequential_string, + _is_found_with_aho_corasick, + _is_sequential_string, ) ) -def is_sequential_string(secret): +def _is_found_with_aho_corasick(secret, automaton): + """ + :type secret: str + + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + :rtype: bool + Returns True if secret contains a word in the automaton. """ - Returns true if string is sequential. + if not automaton: + return False + + if is_python_2(): # pragma: no cover + # Due to pyahocorasick + secret = secret.encode('utf-8') + + try: + next(automaton.iter(string=secret)) + return True + except StopIteration: + return False + + +def _is_sequential_string(secret, *args): + """ + :type secret: str + + :rtype: bool + Returns True if string is sequential. """ sequences = ( - # base64 letters first + # Base64 letters first ( string.ascii_uppercase + string.ascii_uppercase + @@ -29,7 +68,7 @@ def is_sequential_string(secret): '+/' ), - # base64 numbers first + # Base64 numbers first ( string.digits + string.ascii_uppercase + @@ -41,7 +80,7 @@ def is_sequential_string(secret): # sequences, since those will happen to be caught by the # base64 checks. - # alphanumeric sequences + # Alphanumeric sequences (string.digits + string.ascii_uppercase) * 2, # Capturing any number sequences diff --git a/detect_secrets/plugins/common/initialize.py b/detect_secrets/plugins/common/initialize.py index d051c8ee..fb8696d0 100644 --- a/detect_secrets/plugins/common/initialize.py +++ b/detect_secrets/plugins/common/initialize.py @@ -19,6 +19,7 @@ def from_parser_builder( plugins_dict, exclude_lines_regex=None, + automaton=None, should_verify_secrets=False, ): """ @@ -28,16 +29,21 @@ def from_parser_builder( :type exclude_lines_regex: str|None :param exclude_lines_regex: optional regex for ignored lines. + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + :type should_verify_secrets: bool :returns: tuple of initialized plugins """ output = [] + for plugin_name in plugins_dict: output.append( from_plugin_classname( plugin_name, exclude_lines_regex=exclude_lines_regex, + automaton=automaton, should_verify_secrets=should_verify_secrets, **plugins_dict[plugin_name] ), @@ -65,7 +71,7 @@ def _get_prioritized_parameters(plugins_dict, is_using_default_value_map, prefer yield plugin_name, param_name, param_value -def merge_plugin_from_baseline(baseline_plugins, args): +def merge_plugins_from_baseline(baseline_plugins, args, automaton): """ :type baseline_plugins: tuple of BasePlugin :param baseline_plugins: BasePlugin instances from baseline file @@ -73,7 +79,10 @@ def merge_plugin_from_baseline(baseline_plugins, args): :type args: dict :param args: dictionary of arguments parsed from usage - param priority: input param > baseline param > default + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring certain words. + + param priority is input param > baseline param > default :returns: tuple of initialized plugins """ @@ -89,10 +98,10 @@ def _remove_key(d, key): # Use input plugin as starting point if args.use_all_plugins: - # input param and default param are used + # Input param and default param are used plugins_dict = dict(args.plugins) - # baseline param priority > default + # Baseline param priority > default for plugin_name, param_name, param_value in _get_prioritized_parameters( baseline_plugins_dict, args.is_using_default_value, @@ -109,6 +118,7 @@ def _remove_key(d, key): return from_parser_builder( plugins_dict, exclude_lines_regex=args.exclude_lines, + automaton=automaton, should_verify_secrets=not args.no_verify, ) @@ -120,7 +130,7 @@ def _remove_key(d, key): if plugin_name not in disabled_plugins } - # input param priority > baseline + # Input param priority > baseline input_plugins_dict = dict(args.plugins) for plugin_name, param_name, param_value in _get_prioritized_parameters( input_plugins_dict, @@ -139,12 +149,14 @@ def _remove_key(d, key): return from_parser_builder( plugins_dict, exclude_lines_regex=args.exclude_lines, + automaton=automaton, ) def from_plugin_classname( plugin_classname, exclude_lines_regex=None, + automaton=None, should_verify_secrets=False, **kwargs ): @@ -155,6 +167,11 @@ def from_plugin_classname( :type exclude_lines_regex: str|None :param exclude_lines_regex: optional regex for ignored lines. + + :type automaton: ahocorasick.Automaton|None + :param automaton: optional automaton for ignoring English-words. + + :type should_verify_secrets: bool """ klass = globals()[plugin_classname] @@ -165,6 +182,7 @@ def from_plugin_classname( try: instance = klass( exclude_lines_regex=exclude_lines_regex, + automaton=automaton, should_verify=should_verify_secrets, **kwargs ) @@ -207,7 +225,10 @@ def from_secret_type(secret_type, settings): return from_plugin_classname( classname, - # `audit` doesn't need verification + # `audit` does not need to + # perform exclusion, filtering or verification + exclude_lines_regex=None, + automaton=None, should_verify_secrets=False, **plugin_init_vars diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index 7262db60..82aa3c73 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -6,7 +6,6 @@ import configparser import base64 import math -import os import re import string from abc import ABCMeta @@ -15,27 +14,23 @@ import yaml -from .base import BasePlugin +from .base import WordListSupportedDetector +from .common.filetype import determine_file_type +from .common.filetype import FileType from .common.filters import is_false_positive from .common.ini_file_parser import IniFileParser from .common.yaml_file_parser import YamlFileParser from detect_secrets.core.potential_secret import PotentialSecret -YAML_EXTENSIONS = ( - '.yaml', - '.yml', -) - - -class HighEntropyStringsPlugin(BasePlugin): +class HighEntropyStringsPlugin(WordListSupportedDetector): """Base class for string pattern matching""" __metaclass__ = ABCMeta secret_type = 'High Entropy String' - def __init__(self, charset, limit, exclude_lines_regex, *args): + def __init__(self, charset, limit, exclude_lines_regex, automaton, *args): if limit < 0 or limit > 8: raise ValueError( 'The limit set for HighEntropyStrings must be between 0.0 and 8.0', @@ -47,6 +42,7 @@ def __init__(self, charset, limit, exclude_lines_regex, *args): super(HighEntropyStringsPlugin, self).__init__( exclude_lines_regex=exclude_lines_regex, + automaton=automaton, ) def analyze(self, file, filename): @@ -95,7 +91,7 @@ def analyze_string_content(self, string, line_num, filename): output = {} for result in self.secret_generator(string): - if is_false_positive(result): + if is_false_positive(result, self.automaton): continue secret = PotentialSecret(self.secret_type, filename, result, line_num) @@ -135,22 +131,16 @@ def adhoc_scan(self, string): return output @contextmanager - def non_quoted_string_regex(self, strict=True): + def non_quoted_string_regex(self): """For certain file formats, strings need not necessarily follow the normal convention of being denoted by single or double quotes. In these cases, we modify the regex accordingly. Public, because detect_secrets.core.audit needs to reference it. - - :type strict: bool - :param strict: if True, the regex will match the entire string. """ old_regex = self.regex - regex_alternative = r'([{}]+)'.format(re.escape(self.charset)) - if strict: - regex_alternative = r'^' + regex_alternative + r'$' - + regex_alternative = r'^([{}]+)$'.format(re.escape(self.charset)) self.regex = re.compile(regex_alternative) try: @@ -187,7 +177,7 @@ def _analyze_yaml_file(self, file, filename): """ :returns: same format as super().analyze() """ - if os.path.splitext(filename)[1] not in YAML_EXTENSIONS: + if determine_file_type(filename) != FileType.YAML: # The yaml parser is pretty powerful. It eagerly # parses things when it's not even a yaml file. Therefore, # we use this heuristic to quit early if appropriate. @@ -210,9 +200,11 @@ def _analyze_yaml_file(self, file, filename): if '__line__' in item and item['__line__'] not in ignored_lines: # An isinstance check doesn't work in py2 # so we need the __is_binary__ field. - string_to_scan = self.decode_binary(item['__value__']) \ - if item['__is_binary__'] \ + string_to_scan = ( + self.decode_binary(item['__value__']) + if item['__is_binary__'] else item['__value__'] + ) secrets = self.analyze_string( string_to_scan, @@ -276,11 +268,12 @@ class HexHighEntropyString(HighEntropyStringsPlugin): secret_type = 'Hex High Entropy String' - def __init__(self, hex_limit, exclude_lines_regex=None, **kwargs): + def __init__(self, hex_limit, exclude_lines_regex=None, automaton=None, **kwargs): super(HexHighEntropyString, self).__init__( charset=string.hexdigits, limit=hex_limit, exclude_lines_regex=exclude_lines_regex, + automaton=automaton, ) @property @@ -334,11 +327,12 @@ class Base64HighEntropyString(HighEntropyStringsPlugin): secret_type = 'Base64 High Entropy String' - def __init__(self, base64_limit, exclude_lines_regex=None, **kwargs): + def __init__(self, base64_limit, exclude_lines_regex=None, automaton=None, **kwargs): super(Base64HighEntropyString, self).__init__( charset=string.ascii_letters + string.digits + '+/=', limit=base64_limit, exclude_lines_regex=exclude_lines_regex, + automaton=automaton, ) @property diff --git a/detect_secrets/plugins/keyword.py b/detect_secrets/plugins/keyword.py index e9cca6bc..d7526937 100644 --- a/detect_secrets/plugins/keyword.py +++ b/detect_secrets/plugins/keyword.py @@ -28,9 +28,10 @@ import re -from .base import BasePlugin +from .base import WordListSupportedDetector from .common.filetype import determine_file_type from .common.filetype import FileType +from .common.filters import is_false_positive from detect_secrets.core.potential_secret import PotentialSecret @@ -246,16 +247,18 @@ } -class KeywordDetector(BasePlugin): +class KeywordDetector(WordListSupportedDetector): """This checks if denylisted keywords are present in the analyzed string. """ secret_type = 'Secret Keyword' - def __init__(self, keyword_exclude=None, exclude_lines_regex=None, **kwargs): + def __init__(self, keyword_exclude=None, exclude_lines_regex=None, automaton=None, **kwargs): super(KeywordDetector, self).__init__( - exclude_lines_regex, + exclude_lines_regex=exclude_lines_regex, + automaton=automaton, + **kwargs ) self.keyword_exclude = None @@ -276,6 +279,8 @@ def analyze_string_content(self, string, line_num, filename): string, filetype=determine_file_type(filename), ): + if is_false_positive(identifier, self.automaton): + continue secret = PotentialSecret( self.secret_type, filename, diff --git a/detect_secrets/pre_commit_hook.py b/detect_secrets/pre_commit_hook.py index 871be44a..0543f02c 100644 --- a/detect_secrets/pre_commit_hook.py +++ b/detect_secrets/pre_commit_hook.py @@ -13,6 +13,7 @@ from detect_secrets.core.secrets_collection import SecretsCollection from detect_secrets.core.usage import ParserBuilder from detect_secrets.plugins.common import initialize +from detect_secrets.util import build_automaton log = get_logger(format_string='%(message)s') @@ -37,17 +38,24 @@ def main(argv=None): # Error logs handled within logic. return 1 + automaton = None + word_list_hash = None + if args.word_list_file: + automaton, word_list_hash = build_automaton(args.word_list_file) + plugins = initialize.from_parser_builder( args.plugins, exclude_lines_regex=args.exclude_lines, + automaton=automaton, should_verify_secrets=not args.no_verify, ) - # Merge plugin from baseline + # Merge plugins from baseline if baseline_collection: - plugins = initialize.merge_plugin_from_baseline( + plugins = initialize.merge_plugins_from_baseline( baseline_collection.plugins, args, + automaton, ) baseline_collection.plugins = plugins diff --git a/detect_secrets/util.py b/detect_secrets/util.py index b11c30d5..bb8f688c 100644 --- a/detect_secrets/util.py +++ b/detect_secrets/util.py @@ -1,8 +1,51 @@ +import hashlib import os import subprocess +import sys -def get_root_directory(): # pragma: no cover +def is_python_2(): + return sys.version_info[0] < 3 + + +def build_automaton(word_list): + """ + :type word_list: str + :param word_list: optional word list file for ignoring certain words. + + :rtype: (ahocorasick.Automaton, str) + :returns: an automaton, and an iterated sha1 hash of the words in the word list. + """ + # Dynamic import due to optional-dependency + try: + import ahocorasick + except ImportError: # pragma: no cover + print('Please install the `pyahocorasick` package to use --word-list') + raise + + # See https://pyahocorasick.readthedocs.io/en/latest/ + # for more information. + automaton = ahocorasick.Automaton() + word_list_hash = '' + + with open(word_list) as f: + for line in f: + line = line.strip() + if line: + word_list_hash = hashlib.sha1( + (word_list_hash + line).encode('utf-8'), + ).hexdigest() + automaton.add_word(line, line) + + automaton.make_automaton() + + return ( + automaton, + word_list_hash, + ) + + +def get_root_directory(): # pragma: no cover return os.path.realpath( os.path.join( os.path.dirname(__file__), @@ -19,7 +62,7 @@ def get_relative_path(root, path): def get_git_sha(path): - """Returns the sha of the git checkout at the input path + """Returns the sha of the git checkout at the input path. :type path: str :param path: directory of the git checkout @@ -40,7 +83,7 @@ def get_git_sha(path): def get_git_remotes(path): """Returns a list of unique git remotes of the checkout - at the input path + at the input path. :type path: str :param path: directory of the git checkout diff --git a/requirements-dev.txt b/requirements-dev.txt index 2f6cc593..c666ed3f 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,6 +3,7 @@ flake8==3.5.0 mock monotonic pre-commit==1.11.2 +pyahocorasick pytest pyyaml tox>=3.8 diff --git a/setup.py b/setup.py index 76c1cda8..1fd9e554 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,9 @@ 'future', 'functools32', ], + 'word_list': [ + 'pyahocorasick', + ], }, entry_points={ 'console_scripts': [ diff --git a/test_data/word_list.txt b/test_data/word_list.txt new file mode 100644 index 00000000..7c3fc6a2 --- /dev/null +++ b/test_data/word_list.txt @@ -0,0 +1 @@ +c3VwZXIg diff --git a/testing/factories.py b/testing/factories.py index 6c3ea046..db58cee6 100644 --- a/testing/factories.py +++ b/testing/factories.py @@ -4,18 +4,29 @@ from detect_secrets.core.secrets_collection import SecretsCollection -def potential_secret_factory(type_='type', filename='filename', secret='secret', lineno=1): +def potential_secret_factory( + type_='type', + filename='filename', + secret='secret', + lineno=1, +): """This is only marginally better than creating PotentialSecret objects directly, because of the default values. """ return PotentialSecret(type_, filename, secret, lineno) -def secrets_collection_factory(secrets=None, plugins=(), exclude_files_regex=None): +def secrets_collection_factory( + secrets=None, + plugins=(), + exclude_files_regex=None, + word_list_file=None, + word_list_hash=None, +): """ :type secrets: list(dict) :param secrets: list of params to pass to add_secret. - E.g. [ {'secret': 'blah'}, ] + e.g. [ {'secret': 'blah'}, ] :type plugins: tuple :type exclude_files_regex: str|None @@ -25,6 +36,8 @@ def secrets_collection_factory(secrets=None, plugins=(), exclude_files_regex=Non collection = SecretsCollection( plugins, exclude_files=exclude_files_regex, + word_list_file=word_list_file, + word_list_hash=word_list_hash, ) if plugins: diff --git a/tests/core/secrets_collection_test.py b/tests/core/secrets_collection_test.py index b43e6e3b..5c8860ae 100644 --- a/tests/core/secrets_collection_test.py +++ b/tests/core/secrets_collection_test.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +import hashlib import json from contextlib import contextmanager from time import gmtime @@ -287,13 +288,14 @@ def setup(self): PrivateKeyDetector(), ), exclude_files_regex='foo', + word_list_file='will_be_mocked.txt', + word_list_hash='5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8', ) def test_output(self, mock_gmtime): assert ( self.logic.format_for_baseline_output() - - == self.get_point_twelve_and_later_baseline_dict(mock_gmtime) + == self.get_point_twelve_point_seven_and_later_baseline_dict(mock_gmtime) ) def test_load_baseline_from_string_with_pre_point_twelve_string(self, mock_gmtime): @@ -312,12 +314,12 @@ def test_load_baseline_from_string_with_pre_point_twelve_string(self, mock_gmtim assert secrets['exclude']['lines'] is None assert old_original['results'] == secrets['results'] - def test_load_baseline_from_string_with_point_twelve_and_later_string(self, mock_gmtime): + def test_load_baseline_from_string_with_point_twelve_to_twelve_six_string(self, mock_gmtime): """ We use load_baseline_from_string as a proxy to testing load_baseline_from_dict, because it's the most entry into the private function. """ - original = self.get_point_twelve_and_later_baseline_dict(mock_gmtime) + original = self.get_point_twelve_to_twelve_six_later_baseline_dict(mock_gmtime) secrets = SecretsCollection.load_baseline_from_string( json.dumps(original), @@ -327,6 +329,43 @@ def test_load_baseline_from_string_with_point_twelve_and_later_string(self, mock assert secrets['exclude']['lines'] is None assert original['results'] == secrets['results'] + def test_load_baseline_from_string_with_point_twelve_point_seven_and_later_string( + self, + mock_gmtime, + ): + """ + We use load_baseline_from_string as a proxy to testing load_baseline_from_dict, + because it's the most entry into the private function. + """ + original = self.get_point_twelve_point_seven_and_later_baseline_dict(mock_gmtime) + + word_list = """ + roller\n + """ + with mock_open_base( + data=word_list, + namespace='detect_secrets.util.open', + ): + secrets = SecretsCollection.load_baseline_from_string( + json.dumps(original), + ).format_for_baseline_output() + + # v0.12.7+ assertions + assert original['word_list']['file'] == secrets['word_list']['file'] + # Original hash is thrown out and replaced with new word list hash + assert ( + secrets['word_list']['hash'] + == + hashlib.sha1('roller'.encode('utf-8')).hexdigest() + != + original['word_list']['hash'] + ) + + # Regular assertions + assert original['exclude']['files'] == secrets['exclude']['files'] + assert secrets['exclude']['lines'] is None + assert original['results'] == secrets['results'] + def test_load_baseline_without_any_valid_fields(self, mock_log): with pytest.raises(IOError): SecretsCollection.load_baseline_from_string( @@ -346,7 +385,15 @@ def test_load_baseline_without_exclude(self, mock_log): ) assert mock_log.error_messages == 'Incorrectly formatted baseline!\n' - def get_point_twelve_and_later_baseline_dict(self, gmtime): + def get_point_twelve_point_seven_and_later_baseline_dict(self, gmtime): + # In v0.12.7 --word-list got added + baseline = self.get_point_twelve_to_twelve_six_later_baseline_dict(gmtime) + baseline['word_list'] = {} + baseline['word_list']['file'] = 'will_be_mocked.txt' + baseline['word_list']['hash'] = '5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8' + return baseline + + def get_point_twelve_to_twelve_six_later_baseline_dict(self, gmtime): # In v0.12.0 `exclude_regex` got replaced by `exclude` baseline = self._get_baseline_dict(gmtime) baseline['exclude'] = {} diff --git a/tests/main_test.py b/tests/main_test.py index ec69d625..fc023904 100644 --- a/tests/main_test.py +++ b/tests/main_test.py @@ -31,6 +31,8 @@ def test_scan_basic(self, mock_baseline_initialize): exclude_lines_regex=None, path='.', should_scan_all_files=False, + word_list_file=None, + word_list_hash=None, ) def test_scan_with_rootdir(self, mock_baseline_initialize): @@ -43,6 +45,8 @@ def test_scan_with_rootdir(self, mock_baseline_initialize): exclude_lines_regex=None, path=['test_data'], should_scan_all_files=False, + word_list_file=None, + word_list_hash=None, ) def test_scan_with_exclude_args(self, mock_baseline_initialize): @@ -57,6 +61,8 @@ def test_scan_with_exclude_args(self, mock_baseline_initialize): exclude_lines_regex='other_patt', path='.', should_scan_all_files=False, + word_list_file=None, + word_list_hash=None, ) @pytest.mark.parametrize( @@ -139,6 +145,8 @@ def test_scan_with_all_files_flag(self, mock_baseline_initialize): exclude_lines_regex=None, path='.', should_scan_all_files=True, + word_list_file=None, + word_list_hash=None, ) def test_reads_from_stdin(self, mock_merge_baseline): diff --git a/tests/plugins/common/filters_test.py b/tests/plugins/common/filters_test.py index 007d4d08..77e8c93a 100644 --- a/tests/plugins/common/filters_test.py +++ b/tests/plugins/common/filters_test.py @@ -6,12 +6,10 @@ class TestIsSequentialString: - # TODO: More tests should be had. - @pytest.mark.parametrize( 'secret', ( - # ascii sequence + # ASCII sequence 'ABCDEF', 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', @@ -27,13 +25,13 @@ class TestIsSequentialString: '0123456789abcdef', 'abcdef0123456789', - # base64 sequences + # Base64 sequences 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/', '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/', ), ) def test_success(self, secret): - assert filters.is_sequential_string(secret) + assert filters._is_sequential_string(secret) @pytest.mark.parametrize( 'secret', @@ -42,4 +40,4 @@ def test_success(self, secret): ), ) def test_failure(self, secret): - assert not filters.is_sequential_string(secret) + assert not filters._is_sequential_string(secret) diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index 97dffc4e..c706718a 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -293,3 +293,5 @@ def test_discounts_when_all_numbers(self): # This makes sure it only occurs with numbers. assert self.logic.calculate_shannon_entropy('12345a') == \ original_scanner.calculate_shannon_entropy('12345a') + assert self.logic.calculate_shannon_entropy('0') == \ + original_scanner.calculate_shannon_entropy('0') diff --git a/tests/plugins/keyword_test.py b/tests/plugins/keyword_test.py index 3b7eb4bf..9b8866ba 100644 --- a/tests/plugins/keyword_test.py +++ b/tests/plugins/keyword_test.py @@ -1,10 +1,12 @@ from __future__ import absolute_import from __future__ import unicode_literals +import ahocorasick import pytest from detect_secrets.core.potential_secret import PotentialSecret from detect_secrets.plugins.keyword import KeywordDetector +from detect_secrets.util import is_python_2 from testing.mocks import mock_file_object @@ -193,6 +195,28 @@ def test_analyze_standard_positives(self, file_content): == PotentialSecret.hash_secret('m{{h}o)p${e]nob(ody[finds>-_$#thisone}}') ) + @pytest.mark.parametrize( + 'file_content', + STANDARD_POSITIVES, + ) + def test_analyze_standard_positives_with_automaton(self, file_content): + automaton = ahocorasick.Automaton() + + word = 'thisone' + if is_python_2(): # pragma: no cover + # Due to pyahocorasick + word = word.encode('utf-8') + automaton.add_word(word, word) + + automaton.make_automaton() + + logic = KeywordDetector(automaton=automaton) + + f = mock_file_object(file_content) + output = logic.analyze(f, 'mock_filename') + # All skipped due to automaton + assert len(output) == 0 + @pytest.mark.parametrize( 'file_content', STANDARD_POSITIVES, diff --git a/tests/pre_commit_hook_test.py b/tests/pre_commit_hook_test.py index fb04c82e..0aef962b 100644 --- a/tests/pre_commit_hook_test.py +++ b/tests/pre_commit_hook_test.py @@ -47,13 +47,18 @@ def test_file_with_secrets(self, mock_log): assert message_by_lines[3] == \ 'Location: test_data/files/file_with_secrets.py:3' + def test_file_with_secrets_with_word_list(self): + assert_commit_succeeds( + 'test_data/files/file_with_secrets.py --word-list test_data/word_list.txt', + ) + def test_file_no_secrets(self): assert_commit_succeeds('test_data/files/file_with_no_secrets.py') @pytest.mark.parametrize( 'has_result, use_private_key_scan, hook_command, commit_succeeds', [ - # basic case + # Basic case (True, True, '--baseline will_be_mocked test_data/files/file_with_secrets.py', True), # test_no_overwrite_pass_when_baseline_did_not_use_scanner (True, False, '--baseline will_be_mocked test_data/files/private_key', True), @@ -171,6 +176,10 @@ def test_that_baseline_gets_updated( assert original_baseline['exclude_regex'] == baseline_written['exclude']['files'] assert original_baseline['results'] == baseline_written['results'] + assert 'word_list' not in original_baseline + assert baseline_written['word_list']['file'] is None + assert baseline_written['word_list']['hash'] is None + # See that we updated the plugins and version assert current_version == baseline_written['version'] assert baseline_written['plugins_used'] == [ diff --git a/tests/util_test.py b/tests/util_test.py index bd03cdd6..db9c45ef 100644 --- a/tests/util_test.py +++ b/tests/util_test.py @@ -1,9 +1,12 @@ +import hashlib import subprocess import mock import pytest from detect_secrets import util +from detect_secrets.plugins.common import filters +from testing.mocks import mock_open GIT_REPO_SHA = b'cbb33d8c545ccf5c55fdcc7d5b0218078598e677' GIT_REMOTES_VERBOSE_ONE_URL = ( @@ -18,6 +21,26 @@ ) +def test_build_automaton(): + word_list = """ + foam\n + """ + with mock_open( + data=word_list, + namespace='detect_secrets.util.open', + ): + automaton, word_list_hash = util.build_automaton(word_list='will_be_mocked.txt') + assert word_list_hash == hashlib.sha1('foam'.encode('utf-8')).hexdigest() + assert filters._is_found_with_aho_corasick( + secret='foam_roller', + automaton=automaton, + ) + assert not filters._is_found_with_aho_corasick( + secret='no_words_in_word_list', + automaton=automaton, + ) + + def test_get_git_sha(): with mock.patch.object( subprocess, From 7bdf06fb7b41426a18a92df0d00b552e1395adf8 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 21:48:28 -0700 Subject: [PATCH 08/26] :100: Coverage for jwt.py --- detect_secrets/plugins/jwt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect_secrets/plugins/jwt.py b/detect_secrets/plugins/jwt.py index 9868a01b..5128dbef 100644 --- a/detect_secrets/plugins/jwt.py +++ b/detect_secrets/plugins/jwt.py @@ -12,7 +12,7 @@ try: # Python 2 from future_builtins import filter -except ImportError: +except ImportError: # pragma: no cover # Python 3 pass From e4494ea8a088bc7b279c45a2f20a72cff89a525a Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Wed, 18 Sep 2019 21:49:37 -0700 Subject: [PATCH 09/26] :snake: Separate not :100: covered files --- tox.ini | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index c73719cd..d8dd3711 100644 --- a/tox.ini +++ b/tox.ini @@ -11,9 +11,11 @@ deps = -rrequirements-dev.txt whitelist_externals = coverage commands = coverage erase - coverage run -m pytest tests + coverage run -m pytest tests -v coverage report --show-missing --include=tests/* --fail-under 100 - coverage report --show-missing --include=detect_secrets/* --fail-under 98 + # This is so that we do not regress unintentionally + coverage report --show-missing --include=detect_secrets/* --omit=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py,detect_secrets/plugins/high_entropy_strings.py --fail-under 100 + coverage report --show-missing --include=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py,detect_secrets/plugins/high_entropy_strings.py --fail-under 97 pre-commit run --all-files --show-diff-on-failure [testenv:venv] From d3c9583f28df98741d2725b3a20be7eb2b109fdc Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Thu, 19 Sep 2019 10:35:56 -0700 Subject: [PATCH 10/26] :snake: Make automaton case-insensitive By .lower()ing when creating and retrieving --- detect_secrets/plugins/common/filters.py | 3 ++- detect_secrets/util.py | 3 ++- test_data/word_list.txt | 2 +- tests/plugins/high_entropy_strings_test.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/detect_secrets/plugins/common/filters.py b/detect_secrets/plugins/common/filters.py index e0fd0ebf..5c80244e 100644 --- a/detect_secrets/plugins/common/filters.py +++ b/detect_secrets/plugins/common/filters.py @@ -46,7 +46,8 @@ def _is_found_with_aho_corasick(secret, automaton): secret = secret.encode('utf-8') try: - next(automaton.iter(string=secret)) + # .lower() to make everything case-insensitive + next(automaton.iter(string=secret.lower())) return True except StopIteration: return False diff --git a/detect_secrets/util.py b/detect_secrets/util.py index bb8f688c..4ce8fbe0 100644 --- a/detect_secrets/util.py +++ b/detect_secrets/util.py @@ -30,7 +30,8 @@ def build_automaton(word_list): with open(word_list) as f: for line in f: - line = line.strip() + # .lower() to make everything case-insensitive + line = line.lower().strip() if line: word_list_hash = hashlib.sha1( (word_list_hash + line).encode('utf-8'), diff --git a/test_data/word_list.txt b/test_data/word_list.txt index 7c3fc6a2..01ec8ac8 100644 --- a/test_data/word_list.txt +++ b/test_data/word_list.txt @@ -1 +1 @@ -c3VwZXIg +c3vwzxig diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index c706718a..671197d1 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -148,7 +148,7 @@ def setup(self): base64_limit=4.5, exclude_lines_regex='CanonicalUser', ), - non_secret_string='c3VwZXIgc2VjcmV0IHZhbHVl', # too short for high entropy + non_secret_string='c3VwZXIgc2VjcmV0IHZhbHVl', # Too short for high entropy secret_string='c3VwZXIgbG9uZyBzdHJpbmcgc2hvdWxkIGNhdXNlIGVub3VnaCBlbnRyb3B5', ) From 9e3669d51cfab500d3cdbd0dcf9156ccd7bf9e16 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Thu, 19 Sep 2019 14:26:13 -0700 Subject: [PATCH 11/26] :snake: Slightly better condition for is_verified = True --- detect_secrets/plugins/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py index d4695fec..bc654039 100644 --- a/detect_secrets/plugins/base.py +++ b/detect_secrets/plugins/base.py @@ -68,7 +68,7 @@ def analyze(self, file, filename): ) is_verified = self.verify(result.secret_value, content=str(snippet)) - if is_verified != VerifiedResult.UNVERIFIED: + if is_verified == VerifiedResult.VERIFIED_TRUE: result.is_verified = True if is_verified != VerifiedResult.VERIFIED_FALSE: From 7ce4b85ecffc776b80117b1e38a4d194e7a8908a Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Thu, 19 Sep 2019 17:40:14 -0700 Subject: [PATCH 12/26] :telescope: Don't put words less than 4 chars in automaton --- detect_secrets/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/detect_secrets/util.py b/detect_secrets/util.py index 4ce8fbe0..7aaf45bc 100644 --- a/detect_secrets/util.py +++ b/detect_secrets/util.py @@ -32,7 +32,7 @@ def build_automaton(word_list): for line in f: # .lower() to make everything case-insensitive line = line.lower().strip() - if line: + if len(line) > 3: word_list_hash = hashlib.sha1( (word_list_hash + line).encode('utf-8'), ).hexdigest() From de7fbd265942b1d404a3dc7292db386312a9b63d Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Thu, 19 Sep 2019 17:41:28 -0700 Subject: [PATCH 13/26] :100: Coverage for high_entropy_strings.py - :snake: Refactor hadouken code - :snake: Remove high_entropy_strings.py from the uncovered files list in tox --- .../plugins/high_entropy_strings.py | 50 +++++++++---------- tox.ini | 4 +- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index 82aa3c73..b5f3087e 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -196,36 +196,34 @@ def _analyze_yaml_file(self, file, filename): while len(to_search) > 0: item = to_search.pop() - try: - if '__line__' in item and item['__line__'] not in ignored_lines: - # An isinstance check doesn't work in py2 - # so we need the __is_binary__ field. - string_to_scan = ( - self.decode_binary(item['__value__']) - if item['__is_binary__'] - else item['__value__'] - ) - - secrets = self.analyze_string( - string_to_scan, - item['__line__'], - filename, - ) - - if item['__is_binary__']: - secrets = self._encode_yaml_binary_secrets(secrets) - - potential_secrets.update(secrets) - - if '__line__' in item: - continue - + if '__line__' not in item: for key in item: obj = item[key] if isinstance(item, dict) else key if isinstance(obj, dict): to_search.append(obj) - except TypeError: - pass + continue + + if item['__line__'] in ignored_lines: + continue + + # An isinstance check doesn't work in py2 + # so we need the __is_binary__ field. + string_to_scan = ( + self.decode_binary(item['__value__']) + if item['__is_binary__'] + else item['__value__'] + ) + + secrets = self.analyze_string( + string_to_scan, + item['__line__'], + filename, + ) + + if item['__is_binary__']: + secrets = self._encode_yaml_binary_secrets(secrets) + + potential_secrets.update(secrets) return potential_secrets diff --git a/tox.ini b/tox.ini index d8dd3711..f9baf1a5 100644 --- a/tox.ini +++ b/tox.ini @@ -14,8 +14,8 @@ commands = coverage run -m pytest tests -v coverage report --show-missing --include=tests/* --fail-under 100 # This is so that we do not regress unintentionally - coverage report --show-missing --include=detect_secrets/* --omit=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py,detect_secrets/plugins/high_entropy_strings.py --fail-under 100 - coverage report --show-missing --include=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py,detect_secrets/plugins/high_entropy_strings.py --fail-under 97 + coverage report --show-missing --include=detect_secrets/* --omit=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py --fail-under 100 + coverage report --show-missing --include=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py --fail-under 96 pre-commit run --all-files --show-diff-on-failure [testenv:venv] From 139c64ab0edffebfaadb421aa2fbf5ec42d80615 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Fri, 20 Sep 2019 17:19:36 -0700 Subject: [PATCH 14/26] :snake: Refactor `audit --display-results` code - Change `is_secret` string names to `(true|false)-positives` (Negative meaning false-positive was confusing.) - Replace list of secrets with {filename: {plaintext: line:}} - Replace top-level `results` key with `plugins` since we have a results key already --- detect_secrets/core/audit.py | 52 +++++++++++++++++++++++++----------- tests/core/audit_test.py | 42 ++++++++++++++++++----------- tests/main_test.py | 13 ++++++--- 3 files changed, 71 insertions(+), 36 deletions(-) diff --git a/detect_secrets/core/audit.py b/detect_secrets/core/audit.py index f3ad5b27..11fbf0b0 100644 --- a/detect_secrets/core/audit.py +++ b/detect_secrets/core/audit.py @@ -41,16 +41,16 @@ class RedundantComparisonError(Exception): AUDIT_RESULT_TO_STRING = { - True: 'positive', - False: 'negative', - None: 'unknown', + True: 'true-positives', + False: 'false-positives', + None: 'unknowns', } EMPTY_PLUGIN_AUDIT_RESULT = { 'results': { - 'positive': [], - 'negative': [], - 'unknown': [], + 'true-positives': defaultdict(list), + 'false-positives': defaultdict(list), + 'unknowns': defaultdict(list), }, 'config': {}, } @@ -208,12 +208,30 @@ def determine_audit_results(baseline, baseline_path): Given a baseline which has been audited, returns a dictionary describing the results of each plugin in the following form: { - "results": { + "plugins": { "plugin_name1": { "results": { - "positive": [list of secrets with is_secret: true caught by this plugin], - "negative": [list of secrets with is_secret: false caught by this plugin], - "unknown": [list of secrets with no is_secret entry caught by this plugin] + "true-positives": [ + list of { + filename: { + 'line': '...', + 'plaintext':'...', + } + } for secrets with `is_secret: true` caught by this plugin], + "false-positives": [ + list of { + filename: { + 'line': '...', + 'plaintext':'...', + } + } for secrets with `is_secret: false` caught by this plugin], + "unknowns": [ + list of { + filename: { + 'line': '...', + 'plaintext':'...', + } + } for secrets with no `is_secret` entry caught by this plugin] }, "config": {configuration used for the plugin} }, @@ -228,7 +246,7 @@ def determine_audit_results(baseline, baseline_path): all_secrets = _secret_generator(baseline) audit_results = { - 'results': defaultdict(lambda: deepcopy(EMPTY_PLUGIN_AUDIT_RESULT)), + 'plugins': defaultdict(lambda: deepcopy(EMPTY_PLUGIN_AUDIT_RESULT)), } secret_type_to_plugin_name = get_mapping_from_secret_type_to_class_name() @@ -236,26 +254,28 @@ def determine_audit_results(baseline, baseline_path): for filename, secret in all_secrets: file_contents = _open_file_with_cache(filename) + secret_info = {} + secret_info['line'] = _get_file_line(filename, secret['line_number']) try: - secret_plaintext = get_raw_secret_value( + secret_info['plaintext'] = get_raw_secret_value( secret=secret, plugin_settings=baseline['plugins_used'], file_handle=io.StringIO(file_contents), filename=filename, ) except SecretNotFoundOnSpecifiedLineError: - secret_plaintext = _get_file_line(filename, secret['line_number']) + secret_info['plaintext'] = None plugin_name = secret_type_to_plugin_name[secret['type']] audit_result = AUDIT_RESULT_TO_STRING[secret.get('is_secret')] - audit_results['results'][plugin_name]['results'][audit_result].append(secret_plaintext) + audit_results['plugins'][plugin_name]['results'][audit_result][filename].append(secret_info) for plugin_config in baseline['plugins_used']: plugin_name = plugin_config['name'] - if plugin_name not in audit_results['results']: + if plugin_name not in audit_results['plugins']: continue - audit_results['results'][plugin_name]['config'].update(plugin_config) + audit_results['plugins'][plugin_name]['config'].update(plugin_config) git_repo_path = os.path.dirname(os.path.abspath(baseline_path)) git_sha = get_git_sha(git_repo_path) diff --git a/tests/core/audit_test.py b/tests/core/audit_test.py index e6e22ae8..ccc3eb30 100644 --- a/tests/core/audit_test.py +++ b/tests/core/audit_test.py @@ -529,7 +529,7 @@ def get_audited_baseline( baseline_fixture = { 'plugins_used': plugins_used, 'results': { - 'file': [ + 'mocked_file': [ { 'hashed_secret': 'a837eb90d815a852f68f56f70b1b3fab24c46c84', 'line_number': 1, @@ -540,17 +540,17 @@ def get_audited_baseline( } if is_secret is not None: - baseline_fixture['results']['file'][0]['is_secret'] = is_secret + baseline_fixture['results']['mocked_file'][0]['is_secret'] = is_secret return baseline_fixture @pytest.mark.parametrize( 'plugins_used', [ - # NOTE: The first config here needs to be + # Note: The first config here needs to be # the HexHighEntropyString config for this test to work. - [{'name': 'HexHighEntropyString'}], # plugin w/o config - [{'name': 'HexHighEntropyString', 'hex_limit': 2}], # plugin w/config + [{'name': 'HexHighEntropyString'}], # Plugin w/o config + [{'name': 'HexHighEntropyString', 'hex_limit': 2}], # Plugin w/config [ {'name': 'HexHighEntropyString'}, {'name': 'Base64HighEntropyString'}, @@ -570,15 +570,18 @@ def test_determine_audit_results_plugin_config( results = audit.determine_audit_results(baseline, '.secrets.baseline') - assert results['results']['HexHighEntropyString']['config'].items() \ - >= plugins_used[0].items() + assert ( + results['plugins']['HexHighEntropyString']['config'].items() + >= + plugins_used[0].items() + ) @pytest.mark.parametrize( 'is_secret, expected_audited_result', [ - (True, 'positive'), - (False, 'negative'), - (None, 'unknown'), + (True, 'true-positives'), + (False, 'false-positives'), + (None, 'unknowns'), ], ) def test_determine_audit_results_is_secret( @@ -595,12 +598,17 @@ def test_determine_audit_results_is_secret( results = audit.determine_audit_results(baseline, '.secrets.baseline') - for audited_result, list_of_secrets \ - in results['results']['HexHighEntropyString']['results'].items(): + for ( + audited_result, + file_to_secrets, + ) in results['plugins']['HexHighEntropyString']['results'].items(): if audited_result == expected_audited_result: - assert plaintext_secret in list_of_secrets + assert any( # pragma: no cover + secret['plaintext'] == plaintext_secret + for secret in file_to_secrets['mocked_file'] + ) else: - assert len(list_of_secrets) == 0 + assert len(file_to_secrets) == 0 @pytest.mark.parametrize( 'git_remotes, git_sha, expected_git_info', @@ -662,8 +670,10 @@ def test_determine_audit_results_secret_not_found( ): results = audit.determine_audit_results(baseline, '.secrets.baseline') - assert whole_plaintext_line in \ - results['results']['HexHighEntropyString']['results']['positive'] + hex_high_results = results['plugins']['HexHighEntropyString']['results'] + assert len(hex_high_results['true-positives']['mocked_file']) == 1 + assert hex_high_results['true-positives']['mocked_file'][0]['line'] == whole_plaintext_line + assert hex_high_results['true-positives']['mocked_file'][0]['plaintext'] is None class TestPrintAuditResults(): diff --git a/tests/main_test.py b/tests/main_test.py index fc023904..33178f7d 100644 --- a/tests/main_test.py +++ b/tests/main_test.py @@ -583,9 +583,14 @@ def test_audit_short_file(self, filename, expected_output): 'name': 'KeywordDetector', }, 'results': { - 'negative': [], - 'positive': [], - 'unknown': ['nothighenoughentropy'], + 'false-positives': {}, + 'true-positives': {}, + 'unknowns': { + 'test_data/short_files/first_line.php': [{ + 'line': "secret = 'notHighEnoughEntropy'", + 'plaintext': 'nothighenoughentropy', + }], + }, }, }, }, @@ -608,7 +613,7 @@ def test_audit_display_results(self, filename, expected_output): ) as printer_shim: main(['audit', '--display-results', 'MOCKED']) - assert json.loads(uncolor(printer_shim.message))['results'] == expected_output + assert json.loads(uncolor(printer_shim.message))['plugins'] == expected_output def test_audit_diff_not_enough_files(self): assert main('audit --diff fileA'.split()) == 1 From 6c6e028210261bb15745599db33622eb49a1c342 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Fri, 20 Sep 2019 21:07:37 -0700 Subject: [PATCH 15/26] :mortar_board: Use plural variable name for list --- detect_secrets/plugins/aws.py | 8 ++++---- tests/plugins/aws_key_test.py | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/detect_secrets/plugins/aws.py b/detect_secrets/plugins/aws.py index f99d1d06..fc3cf442 100644 --- a/detect_secrets/plugins/aws.py +++ b/detect_secrets/plugins/aws.py @@ -25,18 +25,18 @@ class AWSKeyDetector(RegexBasedDetector): ) def verify(self, token, content): - secret_access_key = get_secret_access_key(content) - if not secret_access_key: + secret_access_key_candidates = get_secret_access_keys(content) + if not secret_access_key_candidates: return VerifiedResult.UNVERIFIED - for candidate in secret_access_key: + for candidate in secret_access_key_candidates: if verify_aws_secret_access_key(token, candidate): return VerifiedResult.VERIFIED_TRUE return VerifiedResult.VERIFIED_FALSE -def get_secret_access_key(content): +def get_secret_access_keys(content): # AWS secret access keys are 40 characters long. regex = re.compile( r'= *([\'"]?)([%s]{40})(\1)$' % ( diff --git a/tests/plugins/aws_key_test.py b/tests/plugins/aws_key_test.py index 1f0e3eea..10a481f0 100644 --- a/tests/plugins/aws_key_test.py +++ b/tests/plugins/aws_key_test.py @@ -8,7 +8,7 @@ from detect_secrets.core.constants import VerifiedResult from detect_secrets.plugins.aws import AWSKeyDetector -from detect_secrets.plugins.aws import get_secret_access_key +from detect_secrets.plugins.aws import get_secret_access_keys from testing.mocks import mock_file_object @@ -119,7 +119,7 @@ def counter(*args, **kwargs): [EXAMPLE_SECRET], ), - # multiple candidates + # Multiple candidates ( textwrap.dedent(""" base64_keyA = '{}' @@ -142,4 +142,4 @@ def counter(*args, **kwargs): ), ) def test_get_secret_access_key(content, expected_output): - assert get_secret_access_key(content) == expected_output + assert get_secret_access_keys(content) == expected_output From 977c4fb5606b42a9c73dfb598fa0a6cd0ab77c90 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Fri, 20 Sep 2019 21:09:28 -0700 Subject: [PATCH 16/26] :tada: Add verification for Mailchimp API keys --- detect_secrets/plugins/mailchimp.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/detect_secrets/plugins/mailchimp.py b/detect_secrets/plugins/mailchimp.py index 9b92ea95..56489051 100644 --- a/detect_secrets/plugins/mailchimp.py +++ b/detect_secrets/plugins/mailchimp.py @@ -4,8 +4,12 @@ from __future__ import absolute_import import re +from base64 import b64encode + +import requests from .base import RegexBasedDetector +from detect_secrets.core.constants import VerifiedResult class MailchimpDetector(RegexBasedDetector): @@ -13,6 +17,24 @@ class MailchimpDetector(RegexBasedDetector): secret_type = 'Mailchimp Access Key' denylist = ( - # Mailchimp key - re.compile(r'[0-9a-z]{32}(-us[0-9]{1,2})'), + re.compile(r'[0-9a-z]{32}-us[0-9]{1,2}'), ) + + def verify(self, token, **kwargs): # pragma: no cover + _, datacenter_number = token.split('-us') + + response = requests.get( + 'https://us{}.api.mailchimp.com/3.0/'.format( + datacenter_number, + ), + headers={ + 'Authorization': b'Basic ' + b64encode( + 'any_user:{}'.format(token).encode('utf-8'), + ), + }, + ) + return ( + VerifiedResult.VERIFIED_TRUE + if response.status_code == 200 + else VerifiedResult.VERIFIED_FALSE + ) From 9cabbe078c16ce476400859ebbdf160c82f6ea80 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Fri, 20 Sep 2019 21:28:40 -0700 Subject: [PATCH 17/26] :tada: Add verification for Stripe secret API keys --- detect_secrets/plugins/stripe.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/detect_secrets/plugins/stripe.py b/detect_secrets/plugins/stripe.py index 2c94420f..8d9e0217 100644 --- a/detect_secrets/plugins/stripe.py +++ b/detect_secrets/plugins/stripe.py @@ -4,8 +4,12 @@ from __future__ import absolute_import import re +from base64 import b64encode + +import requests from .base import RegexBasedDetector +from detect_secrets.core.constants import VerifiedResult class StripeDetector(RegexBasedDetector): @@ -16,3 +20,22 @@ class StripeDetector(RegexBasedDetector): # Stripe standard keys begin with sk_live and restricted with rk_live re.compile(r'(?:r|s)k_live_[0-9a-zA-Z]{24}'), ) + + def verify(self, token, **kwargs): # pragma: no cover + response = requests.get( + 'https://api.stripe.com/v1/charges', + headers={ + 'Authorization': b'Basic ' + b64encode( + '{}:'.format(token).encode('utf-8'), + ), + }, + ) + + if response.status_code == 200: + return VerifiedResult.VERIFIED_TRUE + + # Restricted keys may be limited to certain endpoints + if token.startswith('rk_live'): + return VerifiedResult.UNVERIFIED + + return VerifiedResult.VERIFIED_FALSE From 20d19212de53ae6175cfb2b4aebfd68fe0c18af8 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Fri, 20 Sep 2019 21:37:27 -0700 Subject: [PATCH 18/26] :100: Coverage for baseline.py initialize.py usage.py And normalize main_test.py comments --- detect_secrets/core/usage.py | 6 +--- detect_secrets/plugins/common/initialize.py | 4 +-- tests/core/baseline_test.py | 31 ++++++++++++++++++++- tests/main_test.py | 21 ++++++++------ tests/plugins/common/initialize_test.py | 10 +++++-- tox.ini | 4 +-- 6 files changed, 55 insertions(+), 21 deletions(-) diff --git a/detect_secrets/core/usage.py b/detect_secrets/core/usage.py index 3e933d7a..7626d639 100644 --- a/detect_secrets/core/usage.py +++ b/detect_secrets/core/usage.py @@ -419,11 +419,7 @@ def consolidate_args(args): # Consolidate related args related_args = {} for related_arg_tuple in plugin.related_args: - try: - flag_name, default_value = related_arg_tuple - except ValueError: - flag_name = related_arg_tuple - default_value = None + flag_name, default_value = related_arg_tuple arg_name = PluginOptions._convert_flag_text_to_argument_name( flag_name, diff --git a/detect_secrets/plugins/common/initialize.py b/detect_secrets/plugins/common/initialize.py index fb8696d0..bcb68924 100644 --- a/detect_secrets/plugins/common/initialize.py +++ b/detect_secrets/plugins/common/initialize.py @@ -109,9 +109,9 @@ def _remove_key(d, key): ): try: plugins_dict[plugin_name][param_name] = param_value - except KeyError: + except KeyError: # pragma: no cover log.warning( - 'Baseline contain plugin {} which is not in all plugins! Ignoring...', + 'Baseline contains plugin {} which is not in all plugins! Ignoring...', plugin_name, ) diff --git a/tests/core/baseline_test.py b/tests/core/baseline_test.py index 03d7e1dc..8c50e619 100644 --- a/tests/core/baseline_test.py +++ b/tests/core/baseline_test.py @@ -60,6 +60,29 @@ def test_basic_usage(self, path): assert len(results['test_data/files/file_with_secrets.py']) == 1 assert len(results['test_data/files/tmp/file_with_secrets.py']) == 2 + @pytest.mark.parametrize( + 'path', + [ + [ + './test_data/files', + ], + ], + ) + def test_error_when_getting_git_tracked_files(self, path): + with mock_git_calls( + 'detect_secrets.core.baseline.subprocess.check_output', + ( + SubprocessMock( + expected_input='git -C ./test_data/files ls-files', + should_throw_exception=True, + mocked_output='', + ), + ), + ): + results = self.get_results(path=['./test_data/files']) + + assert not results + def test_with_multiple_files(self): results = self.get_results( path=[ @@ -80,7 +103,13 @@ def test_with_multiple_non_existent_files(self): assert not results def test_with_folders_and_files(self): - results = self.get_results(path=['test_data/', 'non-existent-file.B']) + results = self.get_results( + path=[ + 'non-existent-file.B', + 'test_data/', + 'test_data/empty_folder', + ], + ) assert 'test_data/files/file_with_secrets.py' in results assert 'test_data/files/tmp/file_with_secrets.py' in results diff --git a/tests/main_test.py b/tests/main_test.py index 33178f7d..647ab882 100644 --- a/tests/main_test.py +++ b/tests/main_test.py @@ -221,7 +221,7 @@ def test_old_baseline_ignored_with_update_flag( @pytest.mark.parametrize( 'plugins_used, plugins_overwriten, plugins_wrote', [ - ( # remove some plugins from baseline + ( # Remove some plugins from baseline [ { 'base64_limit': 4.5, @@ -238,7 +238,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # all plugins + ( # All plugins [ { 'base64_limit': 1.5, @@ -284,7 +284,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # remove some plugins from all plugins + ( # Remove some plugins from all plugins [ { 'base64_limit': 4.5, @@ -324,7 +324,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # use same plugin list from baseline + ( # Use same plugin list from baseline [ { 'base64_limit': 3.5, @@ -345,7 +345,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # overwrite base limit from CLI + ( # Overwrite base limit from CLI [ { 'base64_limit': 3.5, @@ -365,7 +365,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # does not overwrite base limit from CLI if baseline not using the plugin + ( # Does not overwrite base limit from CLI if baseline not using the plugin [ { 'name': 'PrivateKeyDetector', @@ -378,7 +378,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # use overwriten option from CLI only when using --use-all-plugins + ( # Use overwriten option from CLI only when using --use-all-plugins [ { 'base64_limit': 3.5, @@ -420,7 +420,7 @@ def test_old_baseline_ignored_with_update_flag( }, ], ), - ( # use plugin limit from baseline when using --use-all-plugins and no input limit + ( # Use plugin limit from baseline when using --use-all-plugins and no input limit [ { 'base64_limit': 2.5, @@ -492,8 +492,11 @@ def test_plugin_from_old_baseline_respected_with_update_flag( ), ) == 0 - assert file_writer.call_args[1]['data']['plugins_used'] == \ + assert ( + file_writer.call_args[1]['data']['plugins_used'] + == plugins_wrote + ) @pytest.mark.parametrize( 'filename, expected_output', diff --git a/tests/plugins/common/initialize_test.py b/tests/plugins/common/initialize_test.py index 7b5161b0..ffdf80bf 100644 --- a/tests/plugins/common/initialize_test.py +++ b/tests/plugins/common/initialize_test.py @@ -55,7 +55,7 @@ def setup(self): def test_success(self): plugin = initialize.from_secret_type( 'Base64 High Entropy String', - self.settings, + settings=self.settings, ) assert isinstance(plugin, Base64HighEntropyString) @@ -64,5 +64,11 @@ def test_success(self): def test_failure(self): assert not initialize.from_secret_type( 'some random secret_type', - self.settings, + settings=self.settings, + ) + + def test_secret_type_not_in_settings(self): + assert not initialize.from_secret_type( + 'Base64 High Entropy String', + settings=[], ) diff --git a/tox.ini b/tox.ini index f9baf1a5..ab646843 100644 --- a/tox.ini +++ b/tox.ini @@ -14,8 +14,8 @@ commands = coverage run -m pytest tests -v coverage report --show-missing --include=tests/* --fail-under 100 # This is so that we do not regress unintentionally - coverage report --show-missing --include=detect_secrets/* --omit=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py --fail-under 100 - coverage report --show-missing --include=detect_secrets/core/audit.py,detect_secrets/core/baseline.py,detect_secrets/core/secrets_collection.py,detect_secrets/core/usage.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py,detect_secrets/plugins/common/initialize.py --fail-under 96 + coverage report --show-missing --include=detect_secrets/* --omit=detect_secrets/core/audit.py,detect_secrets/core/secrets_collection.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py --fail-under 100 + coverage report --show-missing --include=detect_secrets/core/audit.py,detect_secrets/core/secrets_collection.py,detect_secrets/main.py,detect_secrets/plugins/common/ini_file_parser.py --fail-under 96 pre-commit run --all-files --show-diff-on-failure [testenv:venv] From 433b75e82347b1c806af3723ae2d8f3c70fc3827 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 12:25:10 -0700 Subject: [PATCH 19/26] :snake: Add stats to `audit --display-results` code --- detect_secrets/core/audit.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/detect_secrets/core/audit.py b/detect_secrets/core/audit.py index 11fbf0b0..1e5176bc 100644 --- a/detect_secrets/core/audit.py +++ b/detect_secrets/core/audit.py @@ -54,6 +54,21 @@ class RedundantComparisonError(Exception): }, 'config': {}, } +EMPTY_STATS_RESULT = { + 'signal': 0, + 'true-positives': { + 'count': 0, + 'files': defaultdict(int), + }, + 'false-positives': { + 'count': 0, + 'files': defaultdict(int), + }, + 'unknowns': { + 'count': 0, + 'files': defaultdict(int), + }, +} def audit_baseline(baseline_filename): @@ -247,10 +262,12 @@ def determine_audit_results(baseline, baseline_path): audit_results = { 'plugins': defaultdict(lambda: deepcopy(EMPTY_PLUGIN_AUDIT_RESULT)), + 'stats': deepcopy(EMPTY_STATS_RESULT), } secret_type_to_plugin_name = get_mapping_from_secret_type_to_class_name() + total = 0 for filename, secret in all_secrets: file_contents = _open_file_with_cache(filename) @@ -270,6 +287,17 @@ def determine_audit_results(baseline, baseline_path): audit_result = AUDIT_RESULT_TO_STRING[secret.get('is_secret')] audit_results['plugins'][plugin_name]['results'][audit_result][filename].append(secret_info) + audit_results['stats'][audit_result]['count'] += 1 + audit_results['stats'][audit_result]['files'][filename] += 1 + total += 1 + audit_results['stats']['signal'] = str( + ( + audit_results['stats']['true-positives']['count'] + / + total + ) * 100, + )[:4] + '%' + for plugin_config in baseline['plugins_used']: plugin_name = plugin_config['name'] if plugin_name not in audit_results['plugins']: From a096600b4f231c0366e94764fe4e59d8d140edac Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 12:27:06 -0700 Subject: [PATCH 20/26] :performing_arts: Add more to `IGNORED_FILE_EXTENSIONS` --- detect_secrets/core/constants.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/detect_secrets/core/constants.py b/detect_secrets/core/constants.py index 2fb5ca6a..556ea5ba 100644 --- a/detect_secrets/core/constants.py +++ b/detect_secrets/core/constants.py @@ -15,6 +15,7 @@ 'bmp', 'bz2', 'dmg', + 'eot', 'exe', 'gif', 'gz', @@ -22,6 +23,7 @@ 'jar', 'jpg', 'jpeg', + 'mo', 'png', 'rar', 'realm', @@ -30,7 +32,11 @@ 'tar', 'tif', 'tiff', + 'ttf', 'webp', + 'woff', + 'xls', + 'xlsx', 'zip', ) ) From ef5d0006cc953784631f19f7de72ba3ab5972def Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 12:28:06 -0700 Subject: [PATCH 21/26] :bug: Fix auditing a baseline with MailChimp in it --- detect_secrets/plugins/common/util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/detect_secrets/plugins/common/util.py b/detect_secrets/plugins/common/util.py index d604aa73..870273ba 100644 --- a/detect_secrets/plugins/common/util.py +++ b/detect_secrets/plugins/common/util.py @@ -13,6 +13,7 @@ from ..high_entropy_strings import HexHighEntropyString # noqa: F401 from ..jwt import JwtTokenDetector # noqa: F401 from ..keyword import KeywordDetector # noqa: F401 +from ..mailchimp import MailchimpDetector # noqa: F401 from ..private_key import PrivateKeyDetector # noqa: F401 from ..slack import SlackDetector # noqa: F401 from ..stripe import StripeDetector # noqa: F401 From a4933565303ed189aa197bbe77c1c0602fd98fab Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 12:29:16 -0700 Subject: [PATCH 22/26] :bug: Fix TypeError thrown when Yaml was all comments --- detect_secrets/plugins/high_entropy_strings.py | 4 ++++ test_data/only_comments.yaml | 1 + tests/plugins/high_entropy_strings_test.py | 6 +++++- 3 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 test_data/only_comments.yaml diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index b5f3087e..caedfc09 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -188,6 +188,10 @@ def _analyze_yaml_file(self, file, filename): exclude_lines_regex=self.exclude_lines_regex, ) data = parser.json() + # If the file is all comments + if not data: + raise yaml.YAMLError + ignored_lines = parser.get_ignored_lines() potential_secrets = {} diff --git a/test_data/only_comments.yaml b/test_data/only_comments.yaml new file mode 100644 index 00000000..6ea3a968 --- /dev/null +++ b/test_data/only_comments.yaml @@ -0,0 +1 @@ +# This YAML file only has comments, let's not crash on it diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index 671197d1..02679858 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -202,7 +202,7 @@ def test_ini_file(self, filename, secrets): assert count == len(secrets) - def test_yaml_file(self): + def test_yaml_files(self): plugin = Base64HighEntropyString( base64_limit=3, exclude_lines_regex='CanonicalUser', @@ -220,6 +220,10 @@ def test_yaml_file(self): 'Location: test_data/config.yaml:15', ) + with open('test_data/only_comments.yaml') as f: + secrets = plugin.analyze(f, 'test_data/only_comments.yaml') + assert not secrets.values() + def test_env_file(self): plugin = Base64HighEntropyString(4.5) with open('test_data/config.env') as f: From 171eeca90c7841e41bbd3687397e361876a46346 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 12:31:04 -0700 Subject: [PATCH 23/26] :bug: Fix scanning files that don't exist When following a symlink, we just subtracted the `cwd` from the path. This caused us to scan non-existant files. --- detect_secrets/core/baseline.py | 30 +++++++++++++++++------------- detect_secrets/util.py | 15 +++++++++++---- tests/core/baseline_test.py | 26 ++++++++++++++++++++++++-- tests/util_test.py | 13 +++++++++++++ 4 files changed, 65 insertions(+), 19 deletions(-) diff --git a/detect_secrets/core/baseline.py b/detect_secrets/core/baseline.py index 6e573ac9..ccd64eb3 100644 --- a/detect_secrets/core/baseline.py +++ b/detect_secrets/core/baseline.py @@ -55,11 +55,13 @@ def initialize( for element in path: if os.path.isdir(element): if should_scan_all_files: - files_to_scan.extend(_get_files_recursively(element)) + files_to_scan.extend( + _get_files_recursively(element), + ) else: - files = _get_git_tracked_files(element) - if files: - files_to_scan.extend(files) + files_to_scan.extend( + _get_git_tracked_files(element), + ) elif os.path.isfile(element): files_to_scan.append(element) else: @@ -77,7 +79,7 @@ def initialize( files_to_scan, ) - for file in files_to_scan: + for file in sorted(files_to_scan): output.scan_file(file) return output @@ -279,6 +281,7 @@ def _get_git_tracked_files(rootdir='.'): :rtype: set|None :returns: filepaths to files which git currently tracks (locally) """ + output = [] try: with open(os.devnull, 'w') as fnull: git_files = subprocess.check_output( @@ -289,13 +292,13 @@ def _get_git_tracked_files(rootdir='.'): ], stderr=fnull, ) - - return set([ - util.get_relative_path(rootdir, filename) - for filename in git_files.decode('utf-8').split() - ]) + for filename in git_files.decode('utf-8').split(): + relative_path = util.get_relative_path_if_in_cwd(rootdir, filename) + if relative_path: + output.append(relative_path) except subprocess.CalledProcessError: - return None + pass + return output def _get_files_recursively(rootdir): @@ -305,6 +308,7 @@ def _get_files_recursively(rootdir): output = [] for root, _, files in os.walk(rootdir): for filename in files: - output.append(util.get_relative_path(root, filename)) - + relative_path = util.get_relative_path_if_in_cwd(root, filename) + if relative_path: + output.append(relative_path) return output diff --git a/detect_secrets/util.py b/detect_secrets/util.py index 7aaf45bc..4f873f8c 100644 --- a/detect_secrets/util.py +++ b/detect_secrets/util.py @@ -55,11 +55,18 @@ def get_root_directory(): # pragma: no cover ) -def get_relative_path(root, path): - """Returns relative path, after following symlinks.""" - return os.path.realpath( - os.path.join(root, path), +def get_relative_path_if_in_cwd(root, filepath): + """Returns relative path, after following symlinks, + if in current working directory. + + :rtype: str|None + """ + filepath = os.path.realpath( + os.path.join(root, filepath), )[len(os.getcwd() + '/'):] + if os.path.isfile(filepath): + return filepath + return None def get_git_sha(path): diff --git a/tests/core/baseline_test.py b/tests/core/baseline_test.py index 8c50e619..bf0c08d2 100644 --- a/tests/core/baseline_test.py +++ b/tests/core/baseline_test.py @@ -97,8 +97,18 @@ def test_with_multiple_files(self): assert 'test_data/files/tmp/file_with_secrets.py' in results def test_with_multiple_non_existent_files(self): - results = self.get_results(path=['non-existent-file.A', 'non-existent-file.B']) - + with mock.patch( + 'detect_secrets.core.baseline.util.get_relative_path_if_in_cwd', + return_value=None, + ): + results = self.get_results( + path=[ + 'non-existent-file.A', + 'non-existent-file.B', + # Will be non-existant due to mock.patch + 'test_data/files/tmp/', + ], + ) # No expected results, because files don't exist assert not results @@ -163,6 +173,18 @@ def test_scan_all_files(self): ) assert len(results.keys()) == 2 + def test_scan_all_files_with_bad_symlinks(self): + with mock.patch( + 'detect_secrets.core.baseline.util.get_relative_path_if_in_cwd', + return_value=None, + ): + results = self.get_results( + # Will be non-existant due to mock.patch + path=['test_data/files'], + scan_all_files=True, + ) + assert len(results.keys()) == 0 + class TestGetSecretsNotInBaseline(object): diff --git a/tests/util_test.py b/tests/util_test.py index db9c45ef..5b80c9db 100644 --- a/tests/util_test.py +++ b/tests/util_test.py @@ -51,6 +51,19 @@ def test_get_git_sha(): assert util.get_git_sha('.') == GIT_REPO_SHA.decode('utf-8') +def test_get_relative_path_if_in_cwd(): + with mock.patch( + 'detect_secrets.util.os.path.isfile', + return_value=False, + ): + assert ( + util.get_relative_path_if_in_cwd( + 'test_data', + 'config.env', + ) is None + ) + + @pytest.mark.parametrize( 'git_remotes_result, expected_urls', [ From 69a3aacae36cc33547e49da66820e94e5d6da8d3 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 18:26:24 -0700 Subject: [PATCH 24/26] :snake: Use hash.update to be more succinct --- detect_secrets/util.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/detect_secrets/util.py b/detect_secrets/util.py index 4f873f8c..be88daba 100644 --- a/detect_secrets/util.py +++ b/detect_secrets/util.py @@ -26,23 +26,21 @@ def build_automaton(word_list): # See https://pyahocorasick.readthedocs.io/en/latest/ # for more information. automaton = ahocorasick.Automaton() - word_list_hash = '' + word_list_hash = hashlib.sha1() with open(word_list) as f: for line in f: # .lower() to make everything case-insensitive line = line.lower().strip() if len(line) > 3: - word_list_hash = hashlib.sha1( - (word_list_hash + line).encode('utf-8'), - ).hexdigest() + word_list_hash.update(line.encode('utf-8')) automaton.add_word(line, line) automaton.make_automaton() return ( automaton, - word_list_hash, + word_list_hash.hexdigest(), ) From 19d22da332c32b3744f696c5c80f354d5fc5f8f6 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 18:27:12 -0700 Subject: [PATCH 25/26] :snake: Inline leading . in constants.py --- detect_secrets/core/constants.py | 59 +++++++++++++++----------------- 1 file changed, 28 insertions(+), 31 deletions(-) diff --git a/detect_secrets/core/constants.py b/detect_secrets/core/constants.py index 556ea5ba..812f9353 100644 --- a/detect_secrets/core/constants.py +++ b/detect_secrets/core/constants.py @@ -9,37 +9,34 @@ # Definitely something to look into, if this list gets unruly long. IGNORED_FILE_EXTENSIONS = set( ( - '.' + extension - for extension in ( - '7z', - 'bmp', - 'bz2', - 'dmg', - 'eot', - 'exe', - 'gif', - 'gz', - 'ico', - 'jar', - 'jpg', - 'jpeg', - 'mo', - 'png', - 'rar', - 'realm', - 's7z', - 'svg', - 'tar', - 'tif', - 'tiff', - 'ttf', - 'webp', - 'woff', - 'xls', - 'xlsx', - 'zip', - ) - ) + '.7z', + '.bmp', + '.bz2', + '.dmg', + '.eot', + '.exe', + '.gif', + '.gz', + '.ico', + '.jar', + '.jpg', + '.jpeg', + '.mo', + '.png', + '.rar', + '.realm', + '.s7z', + '.svg', + '.tar', + '.tif', + '.tiff', + '.ttf', + '.webp', + '.woff', + '.xls', + '.xlsx', + '.zip', + ), ) From b440623b0109304c9900fa5efaa602ef5dec8be2 Mon Sep 17 00:00:00 2001 From: Kevin Hock Date: Mon, 23 Sep 2019 18:27:45 -0700 Subject: [PATCH 26/26] :snake: Remove useless WordListSupportedDetector class --- detect_secrets/plugins/base.py | 25 ------------------- .../plugins/high_entropy_strings.py | 6 ++--- detect_secrets/plugins/keyword.py | 7 +++--- 3 files changed, 7 insertions(+), 31 deletions(-) diff --git a/detect_secrets/plugins/base.py b/detect_secrets/plugins/base.py index bc654039..2e8bc8a2 100644 --- a/detect_secrets/plugins/base.py +++ b/detect_secrets/plugins/base.py @@ -239,28 +239,3 @@ def secret_generator(self, string, *args, **kwargs): for regex in self.denylist: for match in regex.findall(string): yield match - - -class WordListSupportedDetector(BasePlugin): - """Parent class for detectors supporting a word list. - - To create a new word list supported detector, subclass this - and pass `automaton` in __init__ like: - - class BarDetector(WordListSupportedDetector): - - secret_type = "bar" - - def __init__(self, automaton=None, **kwargs): - super(BarDetector, self).__init__( - automaton, - **kwargs - ) - ... - """ - __metaclass__ = ABCMeta - - def __init__(self, automaton=None, **kwargs): - super(WordListSupportedDetector, self).__init__(**kwargs) - - self.automaton = automaton diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index caedfc09..ee3ce363 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -14,7 +14,7 @@ import yaml -from .base import WordListSupportedDetector +from .base import BasePlugin from .common.filetype import determine_file_type from .common.filetype import FileType from .common.filters import is_false_positive @@ -23,7 +23,7 @@ from detect_secrets.core.potential_secret import PotentialSecret -class HighEntropyStringsPlugin(WordListSupportedDetector): +class HighEntropyStringsPlugin(BasePlugin): """Base class for string pattern matching""" __metaclass__ = ABCMeta @@ -38,11 +38,11 @@ def __init__(self, charset, limit, exclude_lines_regex, automaton, *args): self.charset = charset self.entropy_limit = limit + self.automaton = automaton self.regex = re.compile(r'([\'"])([%s]+)(\1)' % charset) super(HighEntropyStringsPlugin, self).__init__( exclude_lines_regex=exclude_lines_regex, - automaton=automaton, ) def analyze(self, file, filename): diff --git a/detect_secrets/plugins/keyword.py b/detect_secrets/plugins/keyword.py index d7526937..ec3edd17 100644 --- a/detect_secrets/plugins/keyword.py +++ b/detect_secrets/plugins/keyword.py @@ -28,7 +28,7 @@ import re -from .base import WordListSupportedDetector +from .base import BasePlugin from .common.filetype import determine_file_type from .common.filetype import FileType from .common.filters import is_false_positive @@ -247,7 +247,7 @@ } -class KeywordDetector(WordListSupportedDetector): +class KeywordDetector(BasePlugin): """This checks if denylisted keywords are present in the analyzed string. """ @@ -257,7 +257,6 @@ class KeywordDetector(WordListSupportedDetector): def __init__(self, keyword_exclude=None, exclude_lines_regex=None, automaton=None, **kwargs): super(KeywordDetector, self).__init__( exclude_lines_regex=exclude_lines_regex, - automaton=automaton, **kwargs ) @@ -268,6 +267,8 @@ def __init__(self, keyword_exclude=None, exclude_lines_regex=None, automaton=Non re.IGNORECASE, ) + self.automaton = automaton + def analyze_string_content(self, string, line_num, filename): output = {} if (