From 840832c6faa30793466c7029bdd98c72157b4f8c Mon Sep 17 00:00:00 2001 From: Dave Date: Tue, 5 Nov 2024 18:09:13 +0100 Subject: [PATCH] poc for secret string obfuscation in validate_dict --- dlt/common/exceptions.py | 8 ++++++- dlt/common/schema/utils.py | 8 +++++-- dlt/common/utils.py | 39 +++++++++++++++++++++++++++++++++ dlt/common/validation.py | 16 ++++++++++---- tests/common/test_validation.py | 16 ++++++++++++++ 5 files changed, 80 insertions(+), 7 deletions(-) diff --git a/dlt/common/exceptions.py b/dlt/common/exceptions.py index 6a86ab5fbe..0c897a7a46 100644 --- a/dlt/common/exceptions.py +++ b/dlt/common/exceptions.py @@ -1,5 +1,7 @@ from typing import Any, AnyStr, Dict, List, Sequence, Optional, Iterable, Type, TypedDict +from dlt.common.typing import StrAny + class ExceptionTrace(TypedDict, total=False): """Exception trace. NOTE: we intend to change it with an extended line by line trace with code snippets""" @@ -97,18 +99,22 @@ class DictValidationException(DltException): def __init__( self, msg: str, + doc: StrAny, path: str, expected_type: Type[Any] = None, field: str = None, value: Any = None, nested_exceptions: List["DictValidationException"] = None, ) -> None: + from dlt.common.utils import obfuscate_values_in_string + + self.doc = doc self.path = path self.expected_type = expected_type self.field = field self.value = value self.nested_exceptions = nested_exceptions - self.msg = msg + self.msg = obfuscate_values_in_string(doc, msg) super().__init__(msg) def __str__(self) -> str: diff --git a/dlt/common/schema/utils.py b/dlt/common/schema/utils.py index e2e1f959dc..929dca6c43 100644 --- a/dlt/common/schema/utils.py +++ b/dlt/common/schema/utils.py @@ -243,6 +243,7 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: if not isinstance(pv, str): raise DictValidationException( f"field {pk} value {pv} has invalid type {type(pv).__name__} while str is expected", + {}, path, t, pk, @@ -255,6 +256,7 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: except Exception as e: raise DictValidationException( f"field {pk} value {pv[3:]} does not compile as regex: {str(e)}", + {}, path, t, pk, @@ -264,6 +266,7 @@ def simple_regex_validator(path: str, pk: str, pv: Any, t: Any) -> bool: if RE_NON_ALPHANUMERIC_UNDERSCORE.match(pv): raise DictValidationException( f"field {pk} value {pv} looks like a regex, please prefix with re:", + {}, path, t, pk, @@ -283,6 +286,7 @@ def validator(path: str, pk: str, pv: Any, t: Any) -> bool: raise DictValidationException( f"field {pk} value {pv} has invalid type {type(pv).__name__} while" " str is expected", + {}, path, t, pk, @@ -291,11 +295,11 @@ def validator(path: str, pk: str, pv: Any, t: Any) -> bool: try: if naming.normalize_path(pv) != pv: raise DictValidationException( - f"field {pk}: {pv} is not a valid column name", path, t, pk, pv + f"field {pk}: {pv} is not a valid column name", {}, path, t, pk, pv ) except ValueError: raise DictValidationException( - f"field {pk}: {pv} is not a valid column name", path, t, pk, pv + f"field {pk}: {pv} is not a valid column name", {}, path, t, pk, pv ) return True else: diff --git a/dlt/common/utils.py b/dlt/common/utils.py index 170baf6f9f..ecb0c8f407 100644 --- a/dlt/common/utils.py +++ b/dlt/common/utils.py @@ -635,3 +635,42 @@ def __getattribute__(self, name: str) -> Any: raise RuntimeError("This instance has been dropped and cannot be used anymore.") return DefunctClass + + +def obfuscate_string(s: str) -> str: + """Obfuscates string by replacing some or all characters with asterisks""" + if len(s) < 6: + return "*" * len(s) + return s[0] + "*" * (len(s) - 2) + s[-1] + + +def all_strings(d: StrAny) -> List[str]: + """Returns all string values found in the given object""" + strings = [] + + if isinstance(d, str): + return [d] + if isinstance(d, dict): + for v in d.values(): + if isinstance(v, str): + strings.append(v) + elif isinstance(v, dict): + strings.extend(all_strings(v)) + elif isinstance(d, list): + for v in d: + strings.extend(all_strings(v)) + else: + return [] + return strings + + +def obfuscate_values_in_string(d: StrAny, msg: str) -> str: + """Obfuscates all string values found in the dictionary and its nested dictionaries in the message""" + + # create mapping of obfuscated strings + obfuscated_strings = {s: obfuscate_string(s) for s in all_strings(d)} + + for s in obfuscated_strings: + msg = msg.replace(s, obfuscated_strings[s]) + + return msg diff --git a/dlt/common/validation.py b/dlt/common/validation.py index 8862c10024..8d3527a978 100644 --- a/dlt/common/validation.py +++ b/dlt/common/validation.py @@ -65,11 +65,11 @@ def validate_dict( # check missing props missing = set(required_props.keys()).difference(props.keys()) if len(missing): - raise DictValidationException(f"following required fields are missing {missing}", path) + raise DictValidationException(f"following required fields are missing {missing}", doc, path) # check unknown props unexpected = set(props.keys()).difference(allowed_props.keys()) if len(unexpected): - raise DictValidationException(f"following fields are unexpected {unexpected}", path) + raise DictValidationException(f"following fields are unexpected {unexpected}", doc, path) def verify_prop(pk: str, pv: Any, t: Any) -> None: # covers none in optional and union types @@ -108,6 +108,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: msg += f"For {get_type_name(failed.expected_type)}: " + str(failed) + "\n" raise DictValidationException( msg, + doc, path, t, pk, @@ -118,13 +119,14 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: a_l = get_literal_args(t) if pv not in a_l: raise DictValidationException( - f"field '{pk}' with value {pv} is not one of: {a_l}", path, t, pk, pv + f"field '{pk}' with value {pv} is not one of: {a_l}", doc, path, t, pk, pv ) elif t in [int, bool, str, float]: if not isinstance(pv, t): raise DictValidationException( f"field '{pk}' with value {pv} has invalid type '{type(pv).__name__}' while" f" '{t.__name__}' is expected", + doc, path, t, pk, @@ -135,6 +137,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: raise DictValidationException( f"field '{pk}' with value {pv} has invalid type '{type(pv).__name__}' while" f" '{get_type_name(t)}' is expected", + doc, path, t, pk, @@ -146,6 +149,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: raise DictValidationException( f"field '{pk}' with value {pv} has invalid type '{type(pv).__name__}' while" " 'list' is expected", + doc, path, t, pk, @@ -160,6 +164,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: raise DictValidationException( f"field '{pk}' with value {pv} has invalid type '{type(pv).__name__}' while" " 'dict' is expected", + doc, path, t, pk, @@ -170,7 +175,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: for d_k, d_v in pv.items(): if not isinstance(d_k, str): raise DictValidationException( - f"field '{pk}' with key {d_k} must be a string", path, t, pk, d_k + f"field '{pk}' with key {d_k} must be a string", doc, path, t, pk, d_k ) verify_prop(f"{pk}[{d_k}]", d_v, d_v_t) elif t is Any: @@ -188,6 +193,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: raise DictValidationException( f"field '{pk}' expects callable (function or class instance) but got " f" '{pv}'. Mind that signatures are not validated", + doc, path, t, pk, @@ -203,6 +209,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: raise DictValidationException( f"field '{pk}' expects class '{type_name}' but got instance of" f" '{pv_type_name}'", + doc, path, t, pk, @@ -212,6 +219,7 @@ def verify_prop(pk: str, pv: Any, t: Any) -> None: type_name = get_type_name(t) raise DictValidationException( f"field '{pk}' has expected type '{type_name}' which lacks validator", + doc, path, t, pk, diff --git a/tests/common/test_validation.py b/tests/common/test_validation.py index 3f8ccfc20f..693fd9fb78 100644 --- a/tests/common/test_validation.py +++ b/tests/common/test_validation.py @@ -15,6 +15,7 @@ Optional, Union, ) +from dlt.common.configuration.specs import BaseConfiguration from dlt.common import Decimal, jsonpath from dlt.common.exceptions import DictValidationException @@ -406,6 +407,21 @@ class TTestRecordClassUnion(TypedDict): validate_dict(TTestRecordClassUnion, test_item_2, path=".") +def test_secrets_obfuscation() -> None: + class Config(TypedDict): + a: str + b: int + + with pytest.raises(DictValidationException) as e: + validate_dict(Config, {"a": "123456", "b": {"c": "inner_value"}}, ".") + assert "inner_value" not in e.value.msg + assert "123456" not in e.value.msg + # NOTE: message before this change was: + # "field 'b' with value {'c': 'inner_value'} has invalid type 'dict' while 'int' is expected" + # "inner_value" is now obfuscated + assert "{'c': 'i*********e'}" in e.value.msg + + # def test_union_merge() -> None: # """Overriding fields is simply illegal in TypedDict""" # class EndpointResource(TypedDict, total=False):