Merge branch 'main' into patch-1

protectai · Jul 23, 2024 · commit 35450b4
2 parents: 038f793 + 7e87ab3
Show file tree

Hide file tree

Showing 7 changed files with 17 additions and 155 deletions.
diff --git a/llm_guard/input_scanners/anonymize.py b/llm_guard/input_scanners/anonymize.py
@@ -122,7 +122,7 @@ def __init__(
             recognizer=transformers_recognizer,
             regex_groups=get_regex_patterns(regex_patterns),
             custom_names=hidden_names,
-            supported_languages=list(set(["en", language])),
+            supported_languages=ALL_SUPPORTED_LANGUAGES,
         )
 
     def _remove_conflicts_and_get_text_manipulation_data(

diff --git a/llm_guard/input_scanners/anonymize_helpers/analyzer.py b/llm_guard/input_scanners/anonymize_helpers/analyzer.py
@@ -139,7 +139,7 @@ def get_analyzer(
     recognizer: EntityRecognizer,
     regex_groups: list[RegexPattern],
     custom_names: list[str],
-    supported_languages: list[str] = ["en"],
+    supported_languages: list[str],
 ) -> AnalyzerEngine:
     nlp_engine = _get_nlp_engine(languages=supported_languages)
 

diff --git a/llm_guard/input_scanners/anonymize_helpers/ner_mapping.py b/llm_guard/input_scanners/anonymize_helpers/ner_mapping.py
@@ -8,7 +8,6 @@ class NERConfig(TypedDict):
     DEFAULT_MODEL: Model
     LABELS_TO_IGNORE: list[str]
     DEFAULT_EXPLANATION: str
-    DATASET_TO_PRESIDIO_MAPPING: dict[str, str]
     MODEL_TO_PRESIDIO_MAPPING: dict[str, str]
     CHUNK_OVERLAP_SIZE: int
     CHUNK_SIZE: int
@@ -35,12 +34,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-base-NER NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "LOC": "LOCATION",
-        "ORG": "ORGANIZATION",
-        "PER": "PERSON",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "LOC": "LOCATION",
@@ -72,12 +65,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-large-NER NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "LOC": "LOCATION",
-        "ORG": "ORGANIZATION",
-        "PER": "PERSON",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "LOC": "LOCATION",
@@ -108,12 +95,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the gyr66/bert-base-chinese-finetuned-ner NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "address": "LOCATION",
-        "company": "ORGANIZATION",
-        "name": "PERSON",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "address": "LOCATION",
@@ -153,36 +134,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/distilbert_finetuned_ai4privacy_v2 NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "STREET": "LOCATION",
-        "CITY": "LOCATION",
-        "ZIPCODE": "LOCATION",
-        "BUILDINGNUMBER": "LOCATION",
-        "NEARBYGPSCOORDINATES": "LOCATION",
-        "SECONDARYADDRESS": "LOCATION",
-        "STATE": "LOCATION",
-        "COUNTY": "LOCATION",
-        "EMAIL": "EMAIL_ADDRESS",
-        "COMPANYNAME": "ORGANIZATION",
-        "PHONENUMBER": "PHONE_NUMBER",
-        "FIRSTNAME": "PERSON",
-        "LASTNAME": "PERSON",
-        "MIDDLENAME": "PERSON",
-        "CREDITCARDNUMBER": "CREDIT_CARD",
-        "ETHEREUMADDRESS": "CRYPTO",
-        "BITCOINADDRESS": "CRYPTO",
-        "LITECOINADDRESS": "CRYPTO",
-        "DATE": "DATE_TIME",
-        "TIME": "DATE_TIME",
-        "DOB": "DATE_OF_BIRTH",
-        "IBAN": "IBAN_CODE",
-        "IPV4": "IP_ADDRESS",
-        "IPV6": "IP_ADDRESS",
-        "IP": "IP_ADDRESS",
-        "URL": "URL",
-        "AGE": "AGE",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "STREET": "LOCATION",
@@ -246,36 +197,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "STREET": "LOCATION",
-        "CITY": "LOCATION",
-        "ZIPCODE": "LOCATION",
-        "BUILDINGNUMBER": "LOCATION",
-        "NEARBYGPSCOORDINATES": "LOCATION",
-        "SECONDARYADDRESS": "LOCATION",
-        "STATE": "LOCATION",
-        "COUNTY": "LOCATION",
-        "EMAIL": "EMAIL_ADDRESS",
-        "COMPANYNAME": "ORGANIZATION",
-        "PHONENUMBER": "PHONE_NUMBER",
-        "FIRSTNAME": "PERSON",
-        "LASTNAME": "PERSON",
-        "MIDDLENAME": "PERSON",
-        "CREDITCARDNUMBER": "CREDIT_CARD",
-        "ETHEREUMADDRESS": "CRYPTO",
-        "BITCOINADDRESS": "CRYPTO",
-        "LITECOINADDRESS": "CRYPTO",
-        "DATE": "DATE_TIME",
-        "TIME": "DATE_TIME",
-        "DOB": "DATE_OF_BIRTH",
-        "IBAN": "IBAN_CODE",
-        "IPV4": "IP_ADDRESS",
-        "IPV6": "IP_ADDRESS",
-        "IP": "IP_ADDRESS",
-        "URL": "URL",
-        "AGE": "AGE",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "STREET": "LOCATION",
@@ -339,36 +260,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "STREET": "LOCATION",
-        "CITY": "LOCATION",
-        "ZIPCODE": "LOCATION",
-        "BUILDINGNUMBER": "LOCATION",
-        "NEARBYGPSCOORDINATES": "LOCATION",
-        "SECONDARYADDRESS": "LOCATION",
-        "STATE": "LOCATION",
-        "COUNTY": "LOCATION",
-        "EMAIL": "EMAIL_ADDRESS",
-        "COMPANYNAME": "ORGANIZATION",
-        "PHONENUMBER": "PHONE_NUMBER",
-        "FIRSTNAME": "PERSON",
-        "LASTNAME": "PERSON",
-        "MIDDLENAME": "PERSON",
-        "CREDITCARDNUMBER": "CREDIT_CARD",
-        "ETHEREUMADDRESS": "CRYPTO",
-        "BITCOINADDRESS": "CRYPTO",
-        "LITECOINADDRESS": "CRYPTO",
-        "DATE": "DATE_TIME",
-        "TIME": "DATE_TIME",
-        "DOB": "DATE_OF_BIRTH",
-        "IBAN": "IBAN_CODE",
-        "IPV4": "IP_ADDRESS",
-        "IPV6": "IP_ADDRESS",
-        "IP": "IP_ADDRESS",
-        "URL": "URL",
-        "AGE": "AGE",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "STREET": "LOCATION",
@@ -432,38 +323,6 @@ class NERConfig(TypedDict):
     ),
     "LABELS_TO_IGNORE": ["O", "CARDINAL"],
     "DEFAULT_EXPLANATION": "Identified as {} by the lakshyakh93/deberta_finetuned_pii NER model",
-    "DATASET_TO_PRESIDIO_MAPPING": {
-        "MISC": "O",
-        "BUILDINGNUMBER": "LOCATION",
-        "NEARBYGPSCOORDINATE": "LOCATION",
-        "STREET": "LOCATION",
-        "SECONDARYADDRESS": "LOCATION",
-        "PHONE_NUMBER": "PHONE_NUMBER",
-        "EMAIL": "EMAIL_ADDRESS",
-        "COMPANY_NAME": "ORGANIZATION",
-        "FIRSTNAME": "PERSON",
-        "FULLNAME": "PERSON",
-        "NAME": "PERSON",
-        "LASTNAME": "PERSON",
-        "MIDDLENAME": "PERSON",
-        "DATE": "DATE_TIME",
-        "TIME": "DATE_TIME",
-        "BITCOINADDRESS": "CRYPTO",
-        "URL": "URL",
-        "ETHEREUMADDRESS": "CRYPTO",
-        "IPV4": "IP_ADDRESS",
-        "IPV6": "IP_ADDRESS",
-        "CITY": "LOCATION",
-        "ZIPCODE": "LOCATION",
-        "STREETADDRESS": "LOCATION",
-        "CREDITCARDNUMBER": "CREDIT_CARD",
-        "STATE": "LOCATION",
-        "COUNTY": "LOCATION",
-        "SSN": "US_SSN",
-        "LITECOINADDRESS": "CRYPTO",
-        "IP": "IP_ADDRESS",
-        "IBAN": "IBAN_CODE",
-    },
     "MODEL_TO_PRESIDIO_MAPPING": {
         "MISC": "O",
         "BUILDINGNUMBER": "LOCATION",

diff --git a/llm_guard/input_scanners/anonymize_helpers/transformers_recognizer.py b/llm_guard/input_scanners/anonymize_helpers/transformers_recognizer.py
@@ -55,7 +55,6 @@ class TransformersRecognizer(EntityRecognizer):
 
     ignore_labels: list[str]
     model_to_presidio_mapping: dict[str, str]
-    entity_mapping: dict[str, str]
     default_explanation: str
     text_overlap_length: int
     chunk_length: int
@@ -95,7 +94,6 @@ def load_transformer(
         :param use_onnx: flag to use ONNX optimized model
         :type use_onnx: bool, optional
         :param kwargs: define default values for class attributes and modify pipeline behavior
-        **DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format
         **MODEL_TO_PRESIDIO_MAPPING (dict) -  defines mapping entity strings from chosen model format to Presidio format
         **CHUNK_OVERLAP_SIZE (int) - number of overlapping characters in each text chunk
         when splitting a single text into multiple inferences
@@ -107,7 +105,6 @@ def load_transformer(
         **use_onnx (bool) - flag to use ONNX optimized model
         """
 
-        self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
         self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {})
         self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"])
         self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", "")
@@ -177,11 +174,7 @@ def analyze(
 
         for res in ner_results:
             res["entity_group"] = self.__check_label_transformer(res["entity_group"])
-            if not res["entity_group"]:
-                continue
-
-            if res["entity_group"] not in entities:
-                LOGGER.debug("Ignoring entity", entity_group=res["entity_group"])
+            if not res["entity_group"] or res["entity_group"] not in entities:
                 continue
 
             if res["entity_group"] == self.id_entity_name:

diff --git a/llm_guard/output_scanners/sensitive.py b/llm_guard/output_scanners/sensitive.py
@@ -2,7 +2,11 @@
 
 from presidio_anonymizer import AnonymizerEngine
 
-from llm_guard.input_scanners.anonymize import DEFAULT_ENTITY_TYPES, Anonymize
+from llm_guard.input_scanners.anonymize import (
+    ALL_SUPPORTED_LANGUAGES,
+    DEFAULT_ENTITY_TYPES,
+    Anonymize,
+)
 from llm_guard.input_scanners.anonymize_helpers import (
     DEBERTA_AI4PRIVACY_v2_CONF,
     get_analyzer,
@@ -70,7 +74,7 @@ def __init__(
             use_onnx=use_onnx,
         )
         self._analyzer = get_analyzer(
-            transformers_recognizer, get_regex_patterns(regex_patterns), []
+            transformers_recognizer, get_regex_patterns(regex_patterns), [], ALL_SUPPORTED_LANGUAGES
         )
         self._anonymizer = AnonymizerEngine()
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -25,8 +25,8 @@ dependencies = [
   "fuzzysearch>=0.7,<0.9",
   "json-repair>=0.25.2,<0.26",
   "nltk>=3.8,<4",
-  "presidio-analyzer>=2.2,<3",
-  "presidio-anonymizer>=2.2,<3",
+  "presidio-analyzer==2.2.354",
+  "presidio-anonymizer==2.2.354",
   "regex==2024.5.15",
   "tiktoken>=0.5,<0.8",
   "torch>=2.0.1,<=2.3.1",

diff --git a/tests/input_scanners/test_secrets.py b/tests/input_scanners/test_secrets.py
@@ -36,6 +36,12 @@
             False,
             1.0,
         ),  # Prompt with HTTP basic auth
+        (
+            "Securely and attractively display eyewear, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass display, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass displa",
+            "Securely and attractively display eyewear, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass display, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass displa",
+            True,
+            0.0,
+        ),  # False-positive
     ],
 )
 def test_scan(prompt, expected_prompt, expected_valid, expected_score):