daxa-ai · gr8nishan · Aug 7, 2024 · Aug 7, 2024 · Aug 7, 2024 · Aug 12, 2024
diff --git a/pebblo/app/models/models.py b/pebblo/app/models/models.py
@@ -25,9 +25,12 @@ class LoaderMetadata(BaseModel):
 class AiDataModel(BaseModel):
     data: Optional[Union[list, str]]
     entityCount: int
+    entityDetails: Optional[dict] = dict()
     entities: dict
     topicCount: Optional[int] = None
     topics: Optional[dict] = None
+    promptGovEnabled: Optional[bool] = None
+    promptBlocked: Optional[bool] = None
 
     def dict(self, **kwargs):
         kwargs["exclude_none"] = True
@@ -284,4 +287,5 @@ class PromptResponseModel(BaseModel):
 class PromptGovResponseModel(BaseModel):
     entities: dict
     entityCount: int
+    entityDetails: dict  # per-entity details returned by the entity classifier
     message: Optional[str] = None
diff --git a/pebblo/app/service/doc_helper.py b/pebblo/app/service/doc_helper.py
@@ -182,11 +182,14 @@ def _get_classifier_response(self, doc):
         )
         try:
             if doc_info.data:
-                topics, topic_count = topic_classifier_obj.predict(doc_info.data)
+                topics, topic_count, topic_details = topic_classifier_obj.predict(
+                    doc_info.data
+                )
                 (
                     entities,
                     entity_count,
                     anonymized_doc,
+                    entity_details,
                 ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
                     doc_info.data,
                     anonymize_snippets=ClassifierConstants.anonymize_snippets.value,

diff --git a/pebblo/app/service/prompt_gov.py b/pebblo/app/service/prompt_gov.py
@@ -44,12 +44,14 @@ def _get_classifier_response(self):
                     entities,
                     entity_count,
                     anonymized_doc,
+                    entity_details,
                 ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
                     self.input.get("prompt"),
                     anonymize_snippets=False,
                 )
                 doc_info.entities = entities
                 doc_info.entityCount = entity_count
+                doc_info.entityDetails = entity_details
                 doc_info.data = anonymized_doc
             return doc_info
         except Exception as e:
@@ -62,11 +64,10 @@ def process_request(self):
         """
         try:
             doc_info = self._get_classifier_response()
-            logger.debug(f"Entities {doc_info.entities}")
-            logger.debug(f"Entity Count {doc_info.entityCount}")
             response = PromptGovResponseModel(
                 entities=doc_info.entities,
                 entityCount=doc_info.entityCount,
+                entityDetails=doc_info.entityDetails,
                 message="Prompt Governance Processed Successfully",
             )
             return PebbloJsonResponse.build(
@@ -75,7 +76,10 @@ def process_request(self):
 
         except ValidationError as ex:
             response = PromptGovResponseModel(
-                entities={}, entityCount=0, message=f"Error : {str(ex)}"
+                entities={},
+                entityCount=0,
+                entityDetails={},
+                message=f"Error : {str(ex)}",
             )
             logger.error(
                 f"Error in Prompt API process_request. Error:{traceback.format_exc()}"
@@ -85,7 +89,10 @@ def process_request(self):
             )
         except Exception as ex:
             response = PromptGovResponseModel(
-                entities={}, entityCount=0, message=f"Error : {str(ex)}"
+                entities={},
+                entityCount=0,
+                entityDetails={},
+                message=f"Error : {str(ex)}",
             )
             logger.error(
                 f"Error in Prompt API process_request. Error:{traceback.format_exc()}"

diff --git a/pebblo/app/service/prompt_service.py b/pebblo/app/service/prompt_service.py
@@ -45,15 +45,23 @@ def _fetch_classified_data(self, input_data, input_type=""):
             entities,
             entity_count,
             _,
+            entity_details,
         ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
             input_data
         )
 
-        data = {"data": input_data, "entityCount": entity_count, "entities": entities}
+        data = {
+            "data": input_data,
+            "entityCount": entity_count,
+            "entities": entities,
+            "entityDetails": entity_details,
+        }
 
         # Topic classification is performed only for the response.
         if input_type == "response":
-            topics, topic_count = self.topic_classifier_obj.predict(input_data)
+            topics, topic_count, topic_details = self.topic_classifier_obj.predict(
+                input_data
+            )
             data["topicCount"] = topic_count
             data["topics"] = topics
 
@@ -140,15 +148,19 @@ def process_request(self):
             logger.debug("AI App prompt request processing started")
 
             # getting prompt data
-            prompt_data = self._fetch_classified_data(
-                self.data.get("prompt", {}).get("data"), input_type="prompt"
-            )
-
+            prompt_data = self.data.get("prompt", {})
             is_prompt_gov_enabled = self.data.get("prompt", {}).get(
-                "prompt_gov_enabled", False
+                "promptGovEnabled", None
             )
 
-            if is_prompt_gov_enabled is False:
+            # Added for backward compatibility.
+            # Needs to be removed after pebblo 0.20
+            if is_prompt_gov_enabled is None:
+                is_prompt_gov_enabled = self.data.get("prompt", {}).get(
+                    "prompt_gov_enabled"
+                )
+
+            if is_prompt_gov_enabled is None:
                 prompt_data = self._fetch_classified_data(
                     prompt_data.get("data", ""), input_type="prompt"
                 )

diff --git a/pebblo/entity_classifier/README.md b/pebblo/entity_classifier/README.md
@@ -24,7 +24,7 @@ from pebblo.entity_classifier.entity_classifier import EntityClassifier
 
 text = <Input Data>
 entity_classifier_obj = EntityClassifier()
-entities, total_count, anonymized_text = entity_classifier_obj.presidio_entity_classifier_and_anonymizer(text,anonymize_snippets)
+entity_groups, total_entity_count, anonymized_text, entity_details = entity_classifier_obj.presidio_entity_classifier_and_anonymizer(text, anonymize_snippets)
 print(f"Entity Group: {entity_groups}")
 print(f"Entity Count: {total_entity_count}")
 print(f"Anonymized Text: {anonymized_text}")

diff --git a/pebblo/entity_classifier/entity_classifier.py b/pebblo/entity_classifier/entity_classifier.py
@@ -6,6 +6,7 @@
     ConfidenceScore,
     Entities,
     SecretEntities,
+    entity_group_conf_mapping,
 )
 from pebblo.entity_classifier.utils.utils import (
     add_custom_regex_analyzer_registry,
@@ -41,20 +42,47 @@ def custom_analyze(self):
         )
 
     def analyze_response(self, input_text, anonymize_all_entities=True):
-        # Returns analyzed output
+        """
+        Analyze the input text and keep only the entities that pass the per-entity checks.
+
+        Args:
+            input_text (str): The text to be analyzed for detecting entities.
+            anonymize_all_entities (bool): Accepted in the signature but not read
+                                            anywhere in the function logic.
+
+        Returns:
+            list: Analyzer results that are mapped, meet their threshold and are in self.entities.
+        """
+        # Analyze the text to detect entities using the Presidio analyzer
         analyzer_results = self.analyzer.analyze(text=input_text, language="en")
-        analyzer_results = [
-            result
-            for result in analyzer_results
-            if result.score >= float(ConfidenceScore.Entity.value)
-        ]
-        if not anonymize_all_entities:  # Condition for anonymized document
-            analyzer_results = [
-                result
-                for result in analyzer_results
-                if result.entity_type in self.entities
-            ]
-        return analyzer_results
+        # Initialize the list to hold the final classified entities
+        final_results = []
+        # Iterate through the detected entities; a failure on one entity is logged and it is skipped
+        for entity in analyzer_results:
+            try:
+                mapped_entity = None
+                # Map entity type to predefined entities if it exists in the Entities enumeration
+                if entity.entity_type in Entities.__members__:
+                    mapped_entity = Entities[entity.entity_type].value
+                # Check if the entity type exists in SecretEntities enumeration
+                elif entity.entity_type in SecretEntities.__members__:
+                    mapped_entity = SecretEntities[entity.entity_type].value
+                # Keep the entity only if it is mapped, its score reaches the threshold stored as
+                # the first item of its entity_group_conf_mapping tuple, and it is in self.entities
+                if (
+                    mapped_entity
+                    and entity.score
+                    >= float(entity_group_conf_mapping[mapped_entity][0])
+                    and entity.entity_type in self.entities
+                ):
+                    final_results.append(entity)
+            except Exception as ex:
+                logger.warning(
+                    f"Error in analyze_response in entity classification. {str(ex)}"
+                )
+
+        # Return the list of classified entities that met the criteria
+        return final_results
 
     def anonymize_response(self, analyzer_results, input_text):
         # Returns anonymized output
@@ -64,17 +92,50 @@ def anonymize_response(self, analyzer_results, input_text):
 
         return anonymized_text.items, anonymized_text.text
 
+    @staticmethod
+    def get_analyzed_entities_response(data, anonymized_response=None):
+        """
+        Build one detail dict per analyzed entity: type, location, score and group.
+
+        :param data: Analyzer results; each needs entity_type, start, end and score.
+        :param anonymized_response: Optional anonymizer items; when given, their
+            start/end replace the analyzer offsets in "location".
+        :return: List of dicts with entity_type, location, confidence_score, entity_group.
+        """
+        response = []
+        for index, value in enumerate(data):
+            mapped_entity = None
+            if value.entity_type in Entities.__members__:
+                mapped_entity = Entities[value.entity_type].value
+            elif value.entity_type in SecretEntities.__members__:
+                mapped_entity = SecretEntities[value.entity_type].value
+            location = f"{value.start}_{value.end}"
+            if anonymized_response:
+                # NOTE(review): assumes anonymizer items mirror `data` in reverse order - confirm.
+                anonymized_data = anonymized_response[len(data) - index - 1]
+                location = f"{anonymized_data.start}_{anonymized_data.end}"
+            response.append(
+                {
+                    "entity_type": value.entity_type,
+                    "location": location,
+                    "confidence_score": value.score,
+                    "entity_group": entity_group_conf_mapping[mapped_entity][1],
+                }
+            )
+        return response
+
     def presidio_entity_classifier_and_anonymizer(
         self, input_text, anonymize_snippets=False
     ):
         """
         Perform classification on the input data and return a dictionary with the count of each entity group.
         And also returns plain input text as anonymized text output
-        :param anonymize_snippets: Flag whether to anonymize snippets in report.
         :param input_text: Input string / document snippet
+        :param anonymize_snippets: Flag whether to anonymize snippets in report.
         :return: entities: containing the entity group Name as key and its count as value.
                  total_count: Total count of entity groupsInput text in anonymized form.
                  anonymized_text: Input text in anonymized form.
+                 entity_details: Entities with its details such as location and confidence score.
         Example:
 
         input_text = " My SSN is 222-85-4836.
@@ -89,21 +150,30 @@ def presidio_entity_classifier_and_anonymizer(
         """
         entities = {}
         total_count = 0
-        anonymized_text = ""
         try:
             logger.debug("Presidio Entity Classifier and Anonymizer Started.")
 
             analyzer_results = self.analyze_response(input_text)
-            anonymized_response, anonymized_text = self.anonymize_response(
-                analyzer_results, input_text
-            )
+
             if anonymize_snippets:  # If Document snippet needs to be anonymized
+                anonymized_response, anonymized_text = self.anonymize_response(
+                    analyzer_results, input_text
+                )
                 input_text = anonymized_text.replace("<", "&lt;").replace(">", "&gt;")
-            entities, total_count = get_entities(self.entities, anonymized_response)
+                entities_response = self.get_analyzed_entities_response(
+                    analyzer_results, anonymized_response
+                )
+            else:
+                entities_response = self.get_analyzed_entities_response(
+                    analyzer_results
+                )
+            entities, entity_details, total_count = get_entities(
+                self.entities, entities_response
+            )
             logger.debug("Presidio Entity Classifier and Anonymizer Finished")
             logger.debug(f"Entities: {entities}")
             logger.debug(f"Entity Total count: {total_count}")
-            return entities, total_count, input_text
+            return entities, total_count, input_text, entity_details
         except Exception as e:
             logger.error(
                 f"Presidio Entity Classifier and Anonymizer Failed, Exception: {e}"

diff --git a/pebblo/entity_classifier/utils/config.py b/pebblo/entity_classifier/utils/config.py
@@ -21,6 +21,9 @@ class Entities(Enum):
     US_PASSPORT = "us-passport-number"
     US_DRIVER_LICENSE = "us-drivers-license"
 
+    # network
+    IP_ADDRESS = "ip-address"
+
     # Financial
     CREDIT_CARD = "credit-card-number"
     US_BANK_NUMBER = "us-bank-account-number"
@@ -38,6 +41,36 @@ class SecretEntities(Enum):
     GOOGLE_API_KEY = "google-api-key"
 
 
+class PIIGroups(Enum):  # group labels referenced by entity_group_conf_mapping
+    Identification = "pii-identification"
+    Financial = "pii-financial"
+    Secrets = "secrets_and_tokens"
+    Network = "pii-network"
+
+
+entity_group_conf_mapping = {  # entity value -> (minimum confidence score, PII group)
+    # Identification
+    Entities.US_SSN.value: (0.8, PIIGroups.Identification.value),
+    Entities.US_PASSPORT.value: (0.4, PIIGroups.Identification.value),
+    Entities.US_DRIVER_LICENSE.value: (0.4, PIIGroups.Identification.value),
+    # Financial
+    Entities.US_ITIN.value: (0.8, PIIGroups.Financial.value),
+    Entities.CREDIT_CARD.value: (0.8, PIIGroups.Financial.value),
+    Entities.US_BANK_NUMBER.value: (0.4, PIIGroups.Financial.value),
+    Entities.IBAN_CODE.value: (0.8, PIIGroups.Financial.value),
+    # Secret
+    SecretEntities.GITHUB_TOKEN.value: (0.8, PIIGroups.Secrets.value),
+    SecretEntities.SLACK_TOKEN.value: (0.8, PIIGroups.Secrets.value),
+    SecretEntities.AWS_ACCESS_KEY.value: (0.45, PIIGroups.Secrets.value),
+    SecretEntities.AWS_SECRET_KEY.value: (0.8, PIIGroups.Secrets.value),
+    SecretEntities.AZURE_KEY_ID.value: (0.8, PIIGroups.Secrets.value),
+    SecretEntities.AZURE_CLIENT_SECRET.value: (0.8, PIIGroups.Secrets.value),
+    SecretEntities.GOOGLE_API_KEY.value: (0.8, PIIGroups.Secrets.value),
+    # Network
+    Entities.IP_ADDRESS.value: (0.4, PIIGroups.Network.value),
+}
+
+
 class ConfidenceScore(Enum):
     Entity = "0.8"  # based on this score entity output is finalized
     EntityMinScore = "0.45"  # It denotes the pattern's strength

diff --git a/pebblo/entity_classifier/utils/regex_pattern.py b/pebblo/entity_classifier/utils/regex_pattern.py
@@ -11,6 +11,6 @@
     "aws-access-key": r"""\b((?:AKIA|ABIA|ACCA|ASIA)[0-9A-Z]{16})\b""",
     "aws-secret-key": r"""\b([A-Za-z0-9+/]{40})[ \r\n'"\x60]""",
     "azure-key-id": r"""(?i)(%s).{0,20}([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})""",
-    "azure-client-secret": r"""(?i)(%s).{0,20}([a-z0-9_\.\-~]{34})""",
+    "azure-client-secret": r"""(?i)\b(%s).{0,20}([a-z0-9_\.\-~]{34})\b""",
     "google-api-key": r"""(?i)(?:youtube)(?:.|[\n\r]){0,40}\bAIza[0-9A-Za-z\-_]{35}\b""",
 }