From 7e87ab3da8b2b1b54f5fd8dda3acb29c3131ff9e Mon Sep 17 00:00:00 2001 From: Oleksandr Yaremchuk Date: Tue, 23 Jul 2024 08:55:10 +0200 Subject: [PATCH] * lock presidio's version due to the breaking changes --- llm_guard/input_scanners/anonymize.py | 2 +- .../anonymize_helpers/analyzer.py | 2 +- .../anonymize_helpers/ner_mapping.py | 141 ------------------ .../transformers_recognizer.py | 9 +- llm_guard/output_scanners/sensitive.py | 8 +- pyproject.toml | 4 +- tests/input_scanners/test_secrets.py | 6 + 7 files changed, 17 insertions(+), 155 deletions(-) diff --git a/llm_guard/input_scanners/anonymize.py b/llm_guard/input_scanners/anonymize.py index dc5f7288..88f292b5 100644 --- a/llm_guard/input_scanners/anonymize.py +++ b/llm_guard/input_scanners/anonymize.py @@ -122,7 +122,7 @@ def __init__( recognizer=transformers_recognizer, regex_groups=get_regex_patterns(regex_patterns), custom_names=hidden_names, - supported_languages=list(set(["en", language])), + supported_languages=ALL_SUPPORTED_LANGUAGES, ) def _remove_conflicts_and_get_text_manipulation_data( diff --git a/llm_guard/input_scanners/anonymize_helpers/analyzer.py b/llm_guard/input_scanners/anonymize_helpers/analyzer.py index 4bfd2a88..684b642d 100644 --- a/llm_guard/input_scanners/anonymize_helpers/analyzer.py +++ b/llm_guard/input_scanners/anonymize_helpers/analyzer.py @@ -139,7 +139,7 @@ def get_analyzer( recognizer: EntityRecognizer, regex_groups: list[RegexPattern], custom_names: list[str], - supported_languages: list[str] = ["en"], + supported_languages: list[str], ) -> AnalyzerEngine: nlp_engine = _get_nlp_engine(languages=supported_languages) diff --git a/llm_guard/input_scanners/anonymize_helpers/ner_mapping.py b/llm_guard/input_scanners/anonymize_helpers/ner_mapping.py index 7dfd12ed..583bc1c9 100644 --- a/llm_guard/input_scanners/anonymize_helpers/ner_mapping.py +++ b/llm_guard/input_scanners/anonymize_helpers/ner_mapping.py @@ -8,7 +8,6 @@ class NERConfig(TypedDict): DEFAULT_MODEL: Model LABELS_TO_IGNORE: list[str] DEFAULT_EXPLANATION: str - DATASET_TO_PRESIDIO_MAPPING: dict[str, str] MODEL_TO_PRESIDIO_MAPPING: dict[str, str] CHUNK_OVERLAP_SIZE: int CHUNK_SIZE: int @@ -35,12 +34,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-base-NER NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "LOC": "LOCATION", - "ORG": "ORGANIZATION", - "PER": "PERSON", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "LOC": "LOCATION", @@ -72,12 +65,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-large-NER NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "LOC": "LOCATION", - "ORG": "ORGANIZATION", - "PER": "PERSON", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "LOC": "LOCATION", @@ -108,12 +95,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the gyr66/bert-base-chinese-finetuned-ner NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "address": "LOCATION", - "company": "ORGANIZATION", - "name": "PERSON", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "address": "LOCATION", @@ -153,36 +134,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/distilbert_finetuned_ai4privacy_v2 NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "STREET": "LOCATION", - "CITY": "LOCATION", - "ZIPCODE": "LOCATION", - "BUILDINGNUMBER": "LOCATION", - "NEARBYGPSCOORDINATES": "LOCATION", - "SECONDARYADDRESS": "LOCATION", - "STATE": "LOCATION", - "COUNTY": "LOCATION", - "EMAIL": "EMAIL_ADDRESS", - "COMPANYNAME": "ORGANIZATION", - "PHONENUMBER": "PHONE_NUMBER", - "FIRSTNAME": "PERSON", - "LASTNAME": "PERSON", - "MIDDLENAME": "PERSON", - "CREDITCARDNUMBER": "CREDIT_CARD", - "ETHEREUMADDRESS": "CRYPTO", - "BITCOINADDRESS": "CRYPTO", - "LITECOINADDRESS": "CRYPTO", - "DATE": "DATE_TIME", - "TIME": "DATE_TIME", - "DOB": "DATE_OF_BIRTH", - "IBAN": "IBAN_CODE", - "IPV4": "IP_ADDRESS", - "IPV6": "IP_ADDRESS", - "IP": "IP_ADDRESS", - "URL": "URL", - "AGE": "AGE", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "STREET": "LOCATION", @@ -246,36 +197,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "STREET": "LOCATION", - "CITY": "LOCATION", - "ZIPCODE": "LOCATION", - "BUILDINGNUMBER": "LOCATION", - "NEARBYGPSCOORDINATES": "LOCATION", - "SECONDARYADDRESS": "LOCATION", - "STATE": "LOCATION", - "COUNTY": "LOCATION", - "EMAIL": "EMAIL_ADDRESS", - "COMPANYNAME": "ORGANIZATION", - "PHONENUMBER": "PHONE_NUMBER", - "FIRSTNAME": "PERSON", - "LASTNAME": "PERSON", - "MIDDLENAME": "PERSON", - "CREDITCARDNUMBER": "CREDIT_CARD", - "ETHEREUMADDRESS": "CRYPTO", - "BITCOINADDRESS": "CRYPTO", - "LITECOINADDRESS": "CRYPTO", - "DATE": "DATE_TIME", - "TIME": "DATE_TIME", - "DOB": "DATE_OF_BIRTH", - "IBAN": "IBAN_CODE", - "IPV4": "IP_ADDRESS", - "IPV6": "IP_ADDRESS", - "IP": "IP_ADDRESS", - "URL": "URL", - "AGE": "AGE", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "STREET": "LOCATION", @@ -339,36 +260,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "STREET": "LOCATION", - "CITY": "LOCATION", - "ZIPCODE": "LOCATION", - "BUILDINGNUMBER": "LOCATION", - "NEARBYGPSCOORDINATES": "LOCATION", - "SECONDARYADDRESS": "LOCATION", - "STATE": "LOCATION", - "COUNTY": "LOCATION", - "EMAIL": "EMAIL_ADDRESS", - "COMPANYNAME": "ORGANIZATION", - "PHONENUMBER": "PHONE_NUMBER", - "FIRSTNAME": "PERSON", - "LASTNAME": "PERSON", - "MIDDLENAME": "PERSON", - "CREDITCARDNUMBER": "CREDIT_CARD", - "ETHEREUMADDRESS": "CRYPTO", - "BITCOINADDRESS": "CRYPTO", - "LITECOINADDRESS": "CRYPTO", - "DATE": "DATE_TIME", - "TIME": "DATE_TIME", - "DOB": "DATE_OF_BIRTH", - "IBAN": "IBAN_CODE", - "IPV4": "IP_ADDRESS", - "IPV6": "IP_ADDRESS", - "IP": "IP_ADDRESS", - "URL": "URL", - "AGE": "AGE", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "STREET": "LOCATION", @@ -432,38 +323,6 @@ class NERConfig(TypedDict): ), "LABELS_TO_IGNORE": ["O", "CARDINAL"], "DEFAULT_EXPLANATION": "Identified as {} by the lakshyakh93/deberta_finetuned_pii NER model", - "DATASET_TO_PRESIDIO_MAPPING": { - "MISC": "O", - "BUILDINGNUMBER": "LOCATION", - "NEARBYGPSCOORDINATE": "LOCATION", - "STREET": "LOCATION", - "SECONDARYADDRESS": "LOCATION", - "PHONE_NUMBER": "PHONE_NUMBER", - "EMAIL": "EMAIL_ADDRESS", - "COMPANY_NAME": "ORGANIZATION", - "FIRSTNAME": "PERSON", - "FULLNAME": "PERSON", - "NAME": "PERSON", - "LASTNAME": "PERSON", - "MIDDLENAME": "PERSON", - "DATE": "DATE_TIME", - "TIME": "DATE_TIME", - "BITCOINADDRESS": "CRYPTO", - "URL": "URL", - "ETHEREUMADDRESS": "CRYPTO", - "IPV4": "IP_ADDRESS", - "IPV6": "IP_ADDRESS", - "CITY": "LOCATION", - "ZIPCODE": "LOCATION", - "STREETADDRESS": "LOCATION", - "CREDITCARDNUMBER": "CREDIT_CARD", - "STATE": "LOCATION", - "COUNTY": "LOCATION", - "SSN": "US_SSN", - "LITECOINADDRESS": "CRYPTO", - "IP": "IP_ADDRESS", - "IBAN": "IBAN_CODE", - }, "MODEL_TO_PRESIDIO_MAPPING": { "MISC": "O", "BUILDINGNUMBER": "LOCATION", diff --git a/llm_guard/input_scanners/anonymize_helpers/transformers_recognizer.py b/llm_guard/input_scanners/anonymize_helpers/transformers_recognizer.py index 6cbeac40..8c51dda9 100644 --- a/llm_guard/input_scanners/anonymize_helpers/transformers_recognizer.py +++ b/llm_guard/input_scanners/anonymize_helpers/transformers_recognizer.py @@ -55,7 +55,6 @@ class TransformersRecognizer(EntityRecognizer): ignore_labels: list[str] model_to_presidio_mapping: dict[str, str] - entity_mapping: dict[str, str] default_explanation: str text_overlap_length: int chunk_length: int @@ -95,7 +94,6 @@ def load_transformer( :param use_onnx: flag to use ONNX optimized model :type use_onnx: bool, optional :param kwargs: define default values for class attributes and modify pipeline behavior - **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format **MODEL_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from chosen model format to Presidio format **CHUNK_OVERLAP_SIZE (int) - number of overlapping characters in each text chunk when splitting a single text into multiple inferences @@ -107,7 +105,6 @@ def load_transformer( **use_onnx (bool) - flag to use ONNX optimized model """ - self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {}) self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {}) self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"]) self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", "") @@ -177,11 +174,7 @@ def analyze( for res in ner_results: res["entity_group"] = self.__check_label_transformer(res["entity_group"]) - if not res["entity_group"]: - continue - - if res["entity_group"] not in entities: - LOGGER.debug("Ignoring entity", entity_group=res["entity_group"]) + if not res["entity_group"] or res["entity_group"] not in entities: continue if res["entity_group"] == self.id_entity_name: diff --git a/llm_guard/output_scanners/sensitive.py b/llm_guard/output_scanners/sensitive.py index a90b5252..0e498817 100644 --- a/llm_guard/output_scanners/sensitive.py +++ b/llm_guard/output_scanners/sensitive.py @@ -2,7 +2,11 @@ from presidio_anonymizer import AnonymizerEngine -from llm_guard.input_scanners.anonymize import DEFAULT_ENTITY_TYPES, Anonymize +from llm_guard.input_scanners.anonymize import ( + ALL_SUPPORTED_LANGUAGES, + DEFAULT_ENTITY_TYPES, + Anonymize, +) from llm_guard.input_scanners.anonymize_helpers import ( DEBERTA_AI4PRIVACY_v2_CONF, get_analyzer, @@ -70,7 +74,7 @@ def __init__( use_onnx=use_onnx, ) self._analyzer = get_analyzer( - transformers_recognizer, get_regex_patterns(regex_patterns), [] + transformers_recognizer, get_regex_patterns(regex_patterns), [], ALL_SUPPORTED_LANGUAGES ) self._anonymizer = AnonymizerEngine() diff --git a/pyproject.toml b/pyproject.toml index f481d481..33c0c192 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,8 +25,8 @@ dependencies = [ "fuzzysearch>=0.7,<0.9", "json-repair>=0.25.2,<0.26", "nltk>=3.8,<4", - "presidio-analyzer>=2.2,<3", - "presidio-anonymizer>=2.2,<3", + "presidio-analyzer==2.2.354", + "presidio-anonymizer==2.2.354", "regex==2024.5.15", "tiktoken>=0.5,<0.8", "torch>=2.0.1,<=2.3.1", diff --git a/tests/input_scanners/test_secrets.py b/tests/input_scanners/test_secrets.py index 921fc95d..39829cf3 100644 --- a/tests/input_scanners/test_secrets.py +++ b/tests/input_scanners/test_secrets.py @@ -36,6 +36,12 @@ False, 1.0, ), # Prompt with HTTP basic auth + ( + "Securely and attractively display eyewear, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass display, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass displa", + "Securely and attractively display eyewear, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass display, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass displa", + True, + 0.0, + ), # False-positive ], ) def test_scan(prompt, expected_prompt, expected_valid, expected_score):