Skip to content

Commit

Permalink
Merge branch 'main' into patch-1
Browse files: browse the repository at this point in the history
  • Loading branch information
asofter authored Jul 23, 2024
2 parents 038f793 + 7e87ab3 commit 35450b4
Show file tree
Hide file tree
Showing 7 changed files with 17 additions and 155 deletions.
2 changes: 1 addition & 1 deletion llm_guard/input_scanners/anonymize.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def __init__(
recognizer=transformers_recognizer,
regex_groups=get_regex_patterns(regex_patterns),
custom_names=hidden_names,
supported_languages=list(set(["en", language])),
supported_languages=ALL_SUPPORTED_LANGUAGES,
)

def _remove_conflicts_and_get_text_manipulation_data(
Expand Down
2 changes: 1 addition & 1 deletion llm_guard/input_scanners/anonymize_helpers/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def get_analyzer(
recognizer: EntityRecognizer,
regex_groups: list[RegexPattern],
custom_names: list[str],
supported_languages: list[str] = ["en"],
supported_languages: list[str],
) -> AnalyzerEngine:
nlp_engine = _get_nlp_engine(languages=supported_languages)

Expand Down
141 changes: 0 additions & 141 deletions llm_guard/input_scanners/anonymize_helpers/ner_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@ class NERConfig(TypedDict):
DEFAULT_MODEL: Model
LABELS_TO_IGNORE: list[str]
DEFAULT_EXPLANATION: str
DATASET_TO_PRESIDIO_MAPPING: dict[str, str]
MODEL_TO_PRESIDIO_MAPPING: dict[str, str]
CHUNK_OVERLAP_SIZE: int
CHUNK_SIZE: int
Expand All @@ -35,12 +34,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-base-NER NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"LOC": "LOCATION",
"ORG": "ORGANIZATION",
"PER": "PERSON",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"LOC": "LOCATION",
Expand Down Expand Up @@ -72,12 +65,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the dslim/bert-large-NER NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"LOC": "LOCATION",
"ORG": "ORGANIZATION",
"PER": "PERSON",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"LOC": "LOCATION",
Expand Down Expand Up @@ -108,12 +95,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the gyr66/bert-base-chinese-finetuned-ner NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"address": "LOCATION",
"company": "ORGANIZATION",
"name": "PERSON",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"address": "LOCATION",
Expand Down Expand Up @@ -153,36 +134,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/distilbert_finetuned_ai4privacy_v2 NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"STREET": "LOCATION",
"CITY": "LOCATION",
"ZIPCODE": "LOCATION",
"BUILDINGNUMBER": "LOCATION",
"NEARBYGPSCOORDINATES": "LOCATION",
"SECONDARYADDRESS": "LOCATION",
"STATE": "LOCATION",
"COUNTY": "LOCATION",
"EMAIL": "EMAIL_ADDRESS",
"COMPANYNAME": "ORGANIZATION",
"PHONENUMBER": "PHONE_NUMBER",
"FIRSTNAME": "PERSON",
"LASTNAME": "PERSON",
"MIDDLENAME": "PERSON",
"CREDITCARDNUMBER": "CREDIT_CARD",
"ETHEREUMADDRESS": "CRYPTO",
"BITCOINADDRESS": "CRYPTO",
"LITECOINADDRESS": "CRYPTO",
"DATE": "DATE_TIME",
"TIME": "DATE_TIME",
"DOB": "DATE_OF_BIRTH",
"IBAN": "IBAN_CODE",
"IPV4": "IP_ADDRESS",
"IPV6": "IP_ADDRESS",
"IP": "IP_ADDRESS",
"URL": "URL",
"AGE": "AGE",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"STREET": "LOCATION",
Expand Down Expand Up @@ -246,36 +197,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"STREET": "LOCATION",
"CITY": "LOCATION",
"ZIPCODE": "LOCATION",
"BUILDINGNUMBER": "LOCATION",
"NEARBYGPSCOORDINATES": "LOCATION",
"SECONDARYADDRESS": "LOCATION",
"STATE": "LOCATION",
"COUNTY": "LOCATION",
"EMAIL": "EMAIL_ADDRESS",
"COMPANYNAME": "ORGANIZATION",
"PHONENUMBER": "PHONE_NUMBER",
"FIRSTNAME": "PERSON",
"LASTNAME": "PERSON",
"MIDDLENAME": "PERSON",
"CREDITCARDNUMBER": "CREDIT_CARD",
"ETHEREUMADDRESS": "CRYPTO",
"BITCOINADDRESS": "CRYPTO",
"LITECOINADDRESS": "CRYPTO",
"DATE": "DATE_TIME",
"TIME": "DATE_TIME",
"DOB": "DATE_OF_BIRTH",
"IBAN": "IBAN_CODE",
"IPV4": "IP_ADDRESS",
"IPV6": "IP_ADDRESS",
"IP": "IP_ADDRESS",
"URL": "URL",
"AGE": "AGE",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"STREET": "LOCATION",
Expand Down Expand Up @@ -339,36 +260,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the Isotonic/deberta-v3-base_finetuned_ai4privacy_v2 NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"STREET": "LOCATION",
"CITY": "LOCATION",
"ZIPCODE": "LOCATION",
"BUILDINGNUMBER": "LOCATION",
"NEARBYGPSCOORDINATES": "LOCATION",
"SECONDARYADDRESS": "LOCATION",
"STATE": "LOCATION",
"COUNTY": "LOCATION",
"EMAIL": "EMAIL_ADDRESS",
"COMPANYNAME": "ORGANIZATION",
"PHONENUMBER": "PHONE_NUMBER",
"FIRSTNAME": "PERSON",
"LASTNAME": "PERSON",
"MIDDLENAME": "PERSON",
"CREDITCARDNUMBER": "CREDIT_CARD",
"ETHEREUMADDRESS": "CRYPTO",
"BITCOINADDRESS": "CRYPTO",
"LITECOINADDRESS": "CRYPTO",
"DATE": "DATE_TIME",
"TIME": "DATE_TIME",
"DOB": "DATE_OF_BIRTH",
"IBAN": "IBAN_CODE",
"IPV4": "IP_ADDRESS",
"IPV6": "IP_ADDRESS",
"IP": "IP_ADDRESS",
"URL": "URL",
"AGE": "AGE",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"STREET": "LOCATION",
Expand Down Expand Up @@ -432,38 +323,6 @@ class NERConfig(TypedDict):
),
"LABELS_TO_IGNORE": ["O", "CARDINAL"],
"DEFAULT_EXPLANATION": "Identified as {} by the lakshyakh93/deberta_finetuned_pii NER model",
"DATASET_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"BUILDINGNUMBER": "LOCATION",
"NEARBYGPSCOORDINATE": "LOCATION",
"STREET": "LOCATION",
"SECONDARYADDRESS": "LOCATION",
"PHONE_NUMBER": "PHONE_NUMBER",
"EMAIL": "EMAIL_ADDRESS",
"COMPANY_NAME": "ORGANIZATION",
"FIRSTNAME": "PERSON",
"FULLNAME": "PERSON",
"NAME": "PERSON",
"LASTNAME": "PERSON",
"MIDDLENAME": "PERSON",
"DATE": "DATE_TIME",
"TIME": "DATE_TIME",
"BITCOINADDRESS": "CRYPTO",
"URL": "URL",
"ETHEREUMADDRESS": "CRYPTO",
"IPV4": "IP_ADDRESS",
"IPV6": "IP_ADDRESS",
"CITY": "LOCATION",
"ZIPCODE": "LOCATION",
"STREETADDRESS": "LOCATION",
"CREDITCARDNUMBER": "CREDIT_CARD",
"STATE": "LOCATION",
"COUNTY": "LOCATION",
"SSN": "US_SSN",
"LITECOINADDRESS": "CRYPTO",
"IP": "IP_ADDRESS",
"IBAN": "IBAN_CODE",
},
"MODEL_TO_PRESIDIO_MAPPING": {
"MISC": "O",
"BUILDINGNUMBER": "LOCATION",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ class TransformersRecognizer(EntityRecognizer):

ignore_labels: list[str]
model_to_presidio_mapping: dict[str, str]
entity_mapping: dict[str, str]
default_explanation: str
text_overlap_length: int
chunk_length: int
Expand Down Expand Up @@ -95,7 +94,6 @@ def load_transformer(
:param use_onnx: flag to use ONNX optimized model
:type use_onnx: bool, optional
:param kwargs: define default values for class attributes and modify pipeline behavior
**DATASET_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from dataset format to Presidio format
**MODEL_TO_PRESIDIO_MAPPING (dict) - defines mapping entity strings from chosen model format to Presidio format
**CHUNK_OVERLAP_SIZE (int) - number of overlapping characters in each text chunk
when splitting a single text into multiple inferences
Expand All @@ -107,7 +105,6 @@ def load_transformer(
**use_onnx (bool) - flag to use ONNX optimized model
"""

self.entity_mapping = kwargs.get("DATASET_TO_PRESIDIO_MAPPING", {})
self.model_to_presidio_mapping = kwargs.get("MODEL_TO_PRESIDIO_MAPPING", {})
self.ignore_labels = kwargs.get("LABELS_TO_IGNORE", ["O"])
self.default_explanation = kwargs.get("DEFAULT_EXPLANATION", "")
Expand Down Expand Up @@ -177,11 +174,7 @@ def analyze(

for res in ner_results:
res["entity_group"] = self.__check_label_transformer(res["entity_group"])
if not res["entity_group"]:
continue

if res["entity_group"] not in entities:
LOGGER.debug("Ignoring entity", entity_group=res["entity_group"])
if not res["entity_group"] or res["entity_group"] not in entities:
continue

if res["entity_group"] == self.id_entity_name:
Expand Down
8 changes: 6 additions & 2 deletions llm_guard/output_scanners/sensitive.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

from presidio_anonymizer import AnonymizerEngine

from llm_guard.input_scanners.anonymize import DEFAULT_ENTITY_TYPES, Anonymize
from llm_guard.input_scanners.anonymize import (
ALL_SUPPORTED_LANGUAGES,
DEFAULT_ENTITY_TYPES,
Anonymize,
)
from llm_guard.input_scanners.anonymize_helpers import (
DEBERTA_AI4PRIVACY_v2_CONF,
get_analyzer,
Expand Down Expand Up @@ -70,7 +74,7 @@ def __init__(
use_onnx=use_onnx,
)
self._analyzer = get_analyzer(
transformers_recognizer, get_regex_patterns(regex_patterns), []
transformers_recognizer, get_regex_patterns(regex_patterns), [], ALL_SUPPORTED_LANGUAGES
)
self._anonymizer = AnonymizerEngine()

Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ dependencies = [
"fuzzysearch>=0.7,<0.9",
"json-repair>=0.25.2,<0.26",
"nltk>=3.8,<4",
"presidio-analyzer>=2.2,<3",
"presidio-anonymizer>=2.2,<3",
"presidio-analyzer==2.2.354",
"presidio-anonymizer==2.2.354",
"regex==2024.5.15",
"tiktoken>=0.5,<0.8",
"torch>=2.0.1,<=2.3.1",
Expand Down
6 changes: 6 additions & 0 deletions tests/input_scanners/test_secrets.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@
False,
1.0,
), # Prompt with HTTP basic auth
(
"Securely and attractively display eyewear, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass display, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass displa",
"Securely and attractively display eyewear, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass display, allow easy customer browsing,Lack of secure and appealing eyewear displays,Custom sunglass displa",
True,
0.0,
), # False-positive
],
)
def test_scan(prompt, expected_prompt, expected_valid, expected_score):
Expand Down

0 comments on commit 35450b4

Please sign in to comment.