Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added changes for prompt group #466

Merged
merged 8 commits into from
Aug 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions pebblo/entity_classifier/entity_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
ConfidenceScore,
Entities,
SecretEntities,
entity_conf_mapping,
entity_group_conf_mapping,
)
from pebblo.entity_classifier.utils.utils import (
add_custom_regex_analyzer_registry,
Expand Down Expand Up @@ -55,32 +55,27 @@ def analyze_response(self, input_text, anonymize_all_entities=True):
"""
# Analyze the text to detect entities using the Presidio analyzer
analyzer_results = self.analyzer.analyze(text=input_text, language="en")

# Initialize the list to hold the final classified entities
final_results = []

# Iterate through the detected entities
for entity in analyzer_results:
try:
mapped_entity = None

# Map entity type to predefined entities if it exists in the Entities enumeration
if entity.entity_type in Entities.__members__:
mapped_entity = Entities[entity.entity_type].value

# Check if the entity type exists in SecretEntities enumeration
elif entity.entity_type in SecretEntities.__members__:
mapped_entity = SecretEntities[entity.entity_type].value

# Append entity to final results if it meets the confidence threshold and is in the desired entities list
if (
mapped_entity
and entity.score >= float(entity_conf_mapping[mapped_entity])
and entity.score
>= float(entity_group_conf_mapping[mapped_entity][0])
and entity.entity_type in self.entities
):
final_results.append(entity)

# Handle any exceptions that occur during entity classification
except Exception as ex:
logger.warning(
f"Error in analyze_response in entity classification. {str(ex)}"
Expand All @@ -101,8 +96,13 @@ def anonymize_response(self, analyzer_results, input_text):
def get_analyzed_entities_response(data, anonymized_response=None):
# Returns each entity with its location (i.e. start to end) and its confidence score
response = []

for index, value in enumerate(data):
mapped_entity = None
if value.entity_type in Entities.__members__:
mapped_entity = Entities[value.entity_type].value
elif value.entity_type in SecretEntities.__members__:
mapped_entity = SecretEntities[value.entity_type].value

location = f"{value.start}_{value.end}"
if anonymized_response:
anonymized_data = anonymized_response[len(data) - index - 1]
Expand All @@ -112,6 +112,7 @@ def get_analyzed_entities_response(data, anonymized_response=None):
"entity_type": value.entity_type,
"location": location,
"confidence_score": value.score,
"entity_group": entity_group_conf_mapping[mapped_entity][1],
}
)
return response
Expand Down
42 changes: 27 additions & 15 deletions pebblo/entity_classifier/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ class Entities(Enum):
US_PASSPORT = "us-passport-number"
US_DRIVER_LICENSE = "us-drivers-license"

# Network
IP_ADDRESS = "ip-address"

# Financial
CREDIT_CARD = "credit-card-number"
US_BANK_NUMBER = "us-bank-account-number"
Expand All @@ -38,24 +41,33 @@ class SecretEntities(Enum):
GOOGLE_API_KEY = "google-api-key"


entity_conf_mapping = {
class PIIGroups(Enum):
Identification = "pii-identification"
Financial = "pii-financial"
Secrets = "secrets_and_tokens"
Network = "pii-network"


entity_group_conf_mapping = {
# Identification
Entities.US_SSN.value: 0.8,
Entities.US_PASSPORT.value: 0.4,
Entities.US_DRIVER_LICENSE.value: 0.4,
Entities.US_SSN.value: (0.8, PIIGroups.Identification.value),
Entities.US_PASSPORT.value: (0.4, PIIGroups.Identification.value),
Entities.US_DRIVER_LICENSE.value: (0.4, PIIGroups.Identification.value),
# Financial
Entities.US_ITIN.value: 0.8,
Entities.CREDIT_CARD.value: 0.8,
Entities.US_BANK_NUMBER.value: 0.4,
Entities.IBAN_CODE.value: 0.8,
Entities.US_ITIN.value: (0.8, PIIGroups.Financial.value),
Entities.CREDIT_CARD.value: (0.8, PIIGroups.Financial.value),
Entities.US_BANK_NUMBER.value: (0.4, PIIGroups.Financial.value),
Entities.IBAN_CODE.value: (0.8, PIIGroups.Financial.value),
# Secret
SecretEntities.GITHUB_TOKEN.value: 0.8,
SecretEntities.SLACK_TOKEN.value: 0.8,
SecretEntities.AWS_ACCESS_KEY.value: 0.45,
SecretEntities.AWS_SECRET_KEY.value: 0.8,
SecretEntities.AZURE_KEY_ID.value: 0.8,
SecretEntities.AZURE_CLIENT_SECRET.value: 0.8,
SecretEntities.GOOGLE_API_KEY.value: 0.8,
SecretEntities.GITHUB_TOKEN.value: (0.8, PIIGroups.Secrets.value),
SecretEntities.SLACK_TOKEN.value: (0.8, PIIGroups.Secrets.value),
SecretEntities.AWS_ACCESS_KEY.value: (0.45, PIIGroups.Secrets.value),
SecretEntities.AWS_SECRET_KEY.value: (0.8, PIIGroups.Secrets.value),
SecretEntities.AZURE_KEY_ID.value: (0.8, PIIGroups.Secrets.value),
SecretEntities.AZURE_CLIENT_SECRET.value: (0.8, PIIGroups.Secrets.value),
SecretEntities.GOOGLE_API_KEY.value: (0.8, PIIGroups.Secrets.value),
# Network
Entities.IP_ADDRESS.value: (0.4, PIIGroups.Network.value),
}


Expand Down
2 changes: 2 additions & 0 deletions pebblo/entity_classifier/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ConfidenceScore,
Entities,
SecretEntities,
entity_group_conf_mapping,
secret_entities_context_mapping,
)
from pebblo.entity_classifier.utils.regex_pattern import regex_secrets_patterns
Expand All @@ -32,6 +33,7 @@ def get_entities(entities_list, response):
"confidence_score": get_confidence_score_label(
entity["confidence_score"]
),
"entity_group": entity_group_conf_mapping[mapped_entity][1],
}
if mapped_entity in entity_details.keys():
entity_details[mapped_entity].append(entity_data)
Expand Down
1 change: 1 addition & 0 deletions tests/app/service/test_prompt_gov.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ def test_process_request_success(mock_entity_classifier):
{
"location": "16_27",
"confidence_score": "HIGH",
"entity_group": "pii-identification",
}
]
},
Expand Down
1 change: 1 addition & 0 deletions tests/entity_classifier/mock_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,5 @@
Azure Client Secret - c4cb6f91-15a7-4e6d-a824-abcdef012345
<PERSON> - <SLACK_TOKEN>
Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY"
IP Address - <IP_ADDRESS>
"""
1 change: 1 addition & 0 deletions tests/entity_classifier/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Azure Client Secret - c4cb6f91-15a7-4e6d-a824-abcdef012345
Slack Token - xoxb-3204014939555-4519358291237-TTIf0243T8YFSAGEVr1wBrWE
Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY"
My IP Address - 10.55.60.61
"""

negative_data = """
Expand Down
Loading