Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prompt blocking #475

Open
wants to merge 12 commits into
base: pebblo-0.1.18
Choose a base branch
from
4 changes: 4 additions & 0 deletions pebblo/app/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,12 @@ class LoaderMetadata(BaseModel):
class AiDataModel(BaseModel):
data: Optional[Union[list, str]]
entityCount: int
entityDetails: Optional[dict] = dict()
entities: dict
topicCount: Optional[int] = None
topics: Optional[dict] = None
promptGovEnabled: Optional[bool] = None
promptBlocked: Optional[bool] = None

def dict(self, **kwargs):
kwargs["exclude_none"] = True
Expand Down Expand Up @@ -284,4 +287,5 @@ class PromptResponseModel(BaseModel):
class PromptGovResponseModel(BaseModel):
    """Response body returned by the prompt governance API."""

    # Entity group name -> number of occurrences found in the prompt.
    entities: dict
    # Total number of entities found in the prompt.
    entityCount: int
    # Per-entity details (such as location and confidence score).
    entityDetails: dict
    # Human-readable status: a success note or an "Error : ..." description.
    message: Optional[str] = None
5 changes: 4 additions & 1 deletion pebblo/app/service/doc_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,11 +182,14 @@ def _get_classifier_response(self, doc):
)
try:
if doc_info.data:
topics, topic_count = topic_classifier_obj.predict(doc_info.data)
topics, topic_count, topic_details = topic_classifier_obj.predict(
doc_info.data
)
(
entities,
entity_count,
anonymized_doc,
entity_details,
) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
doc_info.data,
anonymize_snippets=ClassifierConstants.anonymize_snippets.value,
Expand Down
15 changes: 11 additions & 4 deletions pebblo/app/service/prompt_gov.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,14 @@ def _get_classifier_response(self):
entities,
entity_count,
anonymized_doc,
entity_details,
) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
self.input.get("prompt"),
anonymize_snippets=False,
)
doc_info.entities = entities
doc_info.entityCount = entity_count
doc_info.entityDetails = entity_details
doc_info.data = anonymized_doc
return doc_info
except Exception as e:
Expand All @@ -62,11 +64,10 @@ def process_request(self):
"""
try:
doc_info = self._get_classifier_response()
logger.debug(f"Entities {doc_info.entities}")
logger.debug(f"Entity Count {doc_info.entityCount}")
response = PromptGovResponseModel(
entities=doc_info.entities,
entityCount=doc_info.entityCount,
entityDetails=doc_info.entityDetails,
message="Prompt Governance Processed Successfully",
)
return PebbloJsonResponse.build(
Expand All @@ -75,7 +76,10 @@ def process_request(self):

except ValidationError as ex:
response = PromptGovResponseModel(
entities={}, entityCount=0, message=f"Error : {str(ex)}"
entities={},
entityCount=0,
entityDetails={},
message=f"Error : {str(ex)}",
)
logger.error(
f"Error in Prompt API process_request. Error:{traceback.format_exc()}"
Expand All @@ -85,7 +89,10 @@ def process_request(self):
)
except Exception as ex:
response = PromptGovResponseModel(
entities={}, entityCount=0, message=f"Error : {str(ex)}"
entities={},
entityCount=0,
entityDetails={},
message=f"Error : {str(ex)}",
)
logger.error(
f"Error in Prompt API process_request. Error:{traceback.format_exc()}"
Expand Down
28 changes: 20 additions & 8 deletions pebblo/app/service/prompt_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,23 @@ def _fetch_classified_data(self, input_data, input_type=""):
entities,
entity_count,
_,
entity_details,
) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
input_data
)

data = {"data": input_data, "entityCount": entity_count, "entities": entities}
data = {
"data": input_data,
"entityCount": entity_count,
"entities": entities,
"entityDetails": entity_details,
}

# Topic classification is performed only for the response.
if input_type == "response":
topics, topic_count = self.topic_classifier_obj.predict(input_data)
topics, topic_count, topic_details = self.topic_classifier_obj.predict(
input_data
)
data["topicCount"] = topic_count
data["topics"] = topics

Expand Down Expand Up @@ -140,15 +148,19 @@ def process_request(self):
logger.debug("AI App prompt request processing started")

# getting prompt data
prompt_data = self._fetch_classified_data(
self.data.get("prompt", {}).get("data"), input_type="prompt"
)

prompt_data = self.data.get("prompt", {})
is_prompt_gov_enabled = self.data.get("prompt", {}).get(
"prompt_gov_enabled", False
"promptGovEnabled", None
gr8nishan marked this conversation as resolved.
Show resolved Hide resolved
)

if is_prompt_gov_enabled is False:
# Added for backward compatibility.
# Needs to be removed after pebblo 0.20
if is_prompt_gov_enabled is None:
is_prompt_gov_enabled = self.data.get("prompt", {}).get(
"prompt_gov_enabled"
)

if is_prompt_gov_enabled is None:
prompt_data = self._fetch_classified_data(
prompt_data.get("data", ""), input_type="prompt"
)
Expand Down
2 changes: 1 addition & 1 deletion pebblo/entity_classifier/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ from pebblo.entity_classifier.entity_classifier import EntityClassifier

text = <Input Data>
entity_classifier_obj = EntityClassifier()
entities, total_count, anonymized_text = entity_classifier_obj.presidio_entity_classifier_and_anonymizer(text,anonymize_snippets)
entities, total_count, anonymized_text, entity_details = entity_classifier_obj.presidio_entity_classifier_and_anonymizer(text,anonymize_snippets)
print(f"Entity Group: {entities}")
print(f"Entity Count: {total_count}")
print(f"Anonymized Text: {anonymized_text}")
Expand Down
110 changes: 90 additions & 20 deletions pebblo/entity_classifier/entity_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ConfidenceScore,
Entities,
SecretEntities,
entity_group_conf_mapping,
)
from pebblo.entity_classifier.utils.utils import (
add_custom_regex_analyzer_registry,
Expand Down Expand Up @@ -41,20 +42,47 @@ def custom_analyze(self):
)

def analyze_response(self, input_text, anonymize_all_entities=True):
    """
    Detect entities in the input text and keep only the ones worth reporting.

    A detected entity is kept when all of the following hold:
      * its type maps to a member of ``Entities`` or ``SecretEntities``;
      * its score reaches the minimum confidence configured for that entity
        in ``entity_group_conf_mapping``;
      * its type is listed in ``self.entities``.

    Args:
        input_text (str): The text to be analyzed for detecting entities.
        anonymize_all_entities (bool): Accepted for backward compatibility;
            not used by the current logic.

    Returns:
        list: The analyzer results that met the criteria above, in the order
            the analyzer produced them.
    """
    # Analyze the text to detect entities using the Presidio analyzer
    analyzer_results = self.analyzer.analyze(text=input_text, language="en")

    final_results = []
    for entity in analyzer_results:
        # A failure on one entity (for example a mapped entity that has no
        # entry in entity_group_conf_mapping) is logged and only that entity
        # is dropped; the remaining results are still processed.
        try:
            entity_type = entity.entity_type
            if entity_type in Entities.__members__:
                mapped_entity = Entities[entity_type].value
            elif entity_type in SecretEntities.__members__:
                mapped_entity = SecretEntities[entity_type].value
            else:
                # Not an entity this classifier knows how to report.
                continue

            # Each entity carries its own threshold; a single global cut-off
            # would discard entities configured with a lower minimum score.
            min_score = float(entity_group_conf_mapping[mapped_entity][0])
            if entity.score >= min_score and entity_type in self.entities:
                final_results.append(entity)
        except Exception as ex:
            logger.warning(
                f"Error in analyze_response in entity classification. {str(ex)}"
            )

    # Return the list of classified entities that met the criteria
    return final_results

def anonymize_response(self, analyzer_results, input_text):
# Returns anonymized output
Expand All @@ -64,17 +92,50 @@ def anonymize_response(self, analyzer_results, input_text):

return anonymized_text.items, anonymized_text.text

@staticmethod
def get_analyzed_entities_response(data, anonymized_response=None):
    """
    Build one detail record per analyzed entity.

    :param data: Analyzer results; each item is read for ``entity_type``,
        ``start``, ``end`` and ``score``.
    :param anonymized_response: Optional anonymizer items. When truthy, the
        location of each entity is taken from these items (``start`` / ``end``)
        instead of from ``data``.
    :return: List of dicts with the keys ``entity_type``, ``location``
        (formatted as "<start>_<end>"), ``confidence_score`` and ``entity_group``.
    :raises KeyError: If an entity has no entry in ``entity_group_conf_mapping``.
        This includes entity types found in neither ``Entities`` nor
        ``SecretEntities``, which are looked up under the key ``None``.
    """
    response = []
    last_index = len(data) - 1
    for index, value in enumerate(data):
        mapped_entity = None
        if value.entity_type in Entities.__members__:
            mapped_entity = Entities[value.entity_type].value
        elif value.entity_type in SecretEntities.__members__:
            mapped_entity = SecretEntities[value.entity_type].value

        location = f"{value.start}_{value.end}"
        if anonymized_response:
            # The i-th analyzer result is paired with the i-th item counted
            # from the end of anonymized_response.
            # NOTE(review): presumably the anonymizer reports its items in
            # reverse order of the analyzer results - confirm.
            anonymized_data = anonymized_response[last_index - index]
            location = f"{anonymized_data.start}_{anonymized_data.end}"
        response.append(
            {
                "entity_type": value.entity_type,
                "location": location,
                "confidence_score": value.score,
                "entity_group": entity_group_conf_mapping[mapped_entity][1],
            }
        )
    return response

def presidio_entity_classifier_and_anonymizer(
self, input_text, anonymize_snippets=False
):
"""
Perform classification on the input data and return a dictionary with the count of each entity group.
And also returns plain input text as anonymized text output
:param anonymize_snippets: Flag whether to anonymize snippets in report.
:param input_text: Input string / document snippet
:param anonymize_snippets: Flag whether to anonymize snippets in report.
:return: entities: containing the entity group Name as key and its count as value.
total_count: Total count of entity groups.
anonymized_text: Input text in anonymized form.
entity_details: Entities with their details such as location and confidence score.
Example:

input_text = " My SSN is 222-85-4836.
Expand All @@ -89,21 +150,30 @@ def presidio_entity_classifier_and_anonymizer(
"""
entities = {}
total_count = 0
anonymized_text = ""
try:
logger.debug("Presidio Entity Classifier and Anonymizer Started.")

analyzer_results = self.analyze_response(input_text)
anonymized_response, anonymized_text = self.anonymize_response(
analyzer_results, input_text
)

if anonymize_snippets: # If Document snippet needs to be anonymized
anonymized_response, anonymized_text = self.anonymize_response(
analyzer_results, input_text
)
input_text = anonymized_text.replace("<", "&lt;").replace(">", "&gt;")
entities, total_count = get_entities(self.entities, anonymized_response)
entities_response = self.get_analyzed_entities_response(
analyzer_results, anonymized_response
)
else:
entities_response = self.get_analyzed_entities_response(
analyzer_results
)
entities, entity_details, total_count = get_entities(
self.entities, entities_response
)
logger.debug("Presidio Entity Classifier and Anonymizer Finished")
logger.debug(f"Entities: {entities}")
logger.debug(f"Entity Total count: {total_count}")
return entities, total_count, input_text
return entities, total_count, input_text, entity_details
except Exception as e:
logger.error(
f"Presidio Entity Classifier and Anonymizer Failed, Exception: {e}"
Expand Down
33 changes: 33 additions & 0 deletions pebblo/entity_classifier/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ class Entities(Enum):
US_PASSPORT = "us-passport-number"
US_DRIVER_LICENSE = "us-drivers-license"

# network
IP_ADDRESS = "ip-address"

# Financial
CREDIT_CARD = "credit-card-number"
US_BANK_NUMBER = "us-bank-account-number"
Expand All @@ -38,6 +41,36 @@ class SecretEntities(Enum):
GOOGLE_API_KEY = "google-api-key"


class PIIGroups(Enum):
    """Group labels under which detected entities are reported."""

    # Identity documents and numbers.
    Identification = "pii-identification"
    # Bank, card and tax related numbers.
    Financial = "pii-financial"
    # API keys, tokens and other credentials.
    Secrets = "secrets_and_tokens"
    # Network identifiers.
    Network = "pii-network"


# Maps each entity's display value to a (minimum confidence score, PII group)
# pair. Index 0 is the threshold a detection must reach to be reported and
# index 1 is the group label attached to the reported entity. The groups are
# merged in the order Identification, Financial, Secrets, Network.
entity_group_conf_mapping = {
    # Identification
    **{
        entity.value: (min_score, PIIGroups.Identification.value)
        for entity, min_score in (
            (Entities.US_SSN, 0.8),
            (Entities.US_PASSPORT, 0.4),
            (Entities.US_DRIVER_LICENSE, 0.4),
        )
    },
    # Financial
    **{
        entity.value: (min_score, PIIGroups.Financial.value)
        for entity, min_score in (
            (Entities.US_ITIN, 0.8),
            (Entities.CREDIT_CARD, 0.8),
            (Entities.US_BANK_NUMBER, 0.4),
            (Entities.IBAN_CODE, 0.8),
        )
    },
    # Secret
    **{
        entity.value: (min_score, PIIGroups.Secrets.value)
        for entity, min_score in (
            (SecretEntities.GITHUB_TOKEN, 0.8),
            (SecretEntities.SLACK_TOKEN, 0.8),
            (SecretEntities.AWS_ACCESS_KEY, 0.45),
            (SecretEntities.AWS_SECRET_KEY, 0.8),
            (SecretEntities.AZURE_KEY_ID, 0.8),
            (SecretEntities.AZURE_CLIENT_SECRET, 0.8),
            (SecretEntities.GOOGLE_API_KEY, 0.8),
        )
    },
    # Network
    **{
        entity.value: (min_score, PIIGroups.Network.value)
        for entity, min_score in ((Entities.IP_ADDRESS, 0.4),)
    },
}


class ConfidenceScore(Enum):
Entity = "0.8" # based on this score entity output is finalized
EntityMinScore = "0.45" # It denotes the pattern's strength
Expand Down
2 changes: 1 addition & 1 deletion pebblo/entity_classifier/utils/regex_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,6 @@
"aws-access-key": r"""\b((?:AKIA|ABIA|ACCA|ASIA)[0-9A-Z]{16})\b""",
"aws-secret-key": r"""\b([A-Za-z0-9+/]{40})[ \r\n'"\x60]""",
"azure-key-id": r"""(?i)(%s).{0,20}([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})""",
"azure-client-secret": r"""(?i)(%s).{0,20}([a-z0-9_\.\-~]{34})""",
"azure-client-secret": r"""\b(?i)(%s).{0,20}([a-z0-9_\.\-~]{34})\b""",
"google-api-key": r"""(?i)(?:youtube)(?:.|[\n\r]){0,40}\bAIza[0-9A-Za-z\-_]{35}\b""",
}
Loading