From 703b5bcfc38e40f22eab17544d686b30cee315bb Mon Sep 17 00:00:00 2001 From: Dristy Srivastava Date: Thu, 12 Sep 2024 12:16:28 +0530 Subject: [PATCH 1/3] Adding classifier mode changes for file type storage --- pebblo/app/api/req_models.py | 1 + pebblo/app/service/doc_helper.py | 50 +++++++++++++++++----------- pebblo/app/service/service.py | 13 +++++++- tests/app/service/test_loader_doc.py | 2 +- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/pebblo/app/api/req_models.py b/pebblo/app/api/req_models.py index d7df4fbf..0e4cfab1 100644 --- a/pebblo/app/api/req_models.py +++ b/pebblo/app/api/req_models.py @@ -65,6 +65,7 @@ class ReqLoaderDoc(BaseModel): loading_end: bool source_owner: str classifier_location: str + classifier_mode: Optional[str] = None class Context(BaseModel): diff --git a/pebblo/app/service/doc_helper.py b/pebblo/app/service/doc_helper.py index 8aef41e5..a823769c 100644 --- a/pebblo/app/service/doc_helper.py +++ b/pebblo/app/service/doc_helper.py @@ -6,6 +6,7 @@ import os.path from datetime import datetime +from pebblo.app.enums.common import ClassificationMode from pebblo.app.enums.enums import CacheDir, ClassifierConstants, ReportConstants from pebblo.app.models.models import ( AiDataModel, @@ -36,11 +37,12 @@ class LoaderHelper: Class for loader doc related task """ - def __init__(self, app_details, data, load_id): + def __init__(self, app_details, data, load_id, classifier_mode): self.app_details = app_details self.data = data self.load_id = load_id self.loader_mapper = {} + self.classifier_mode = classifier_mode self.entity_classifier_obj = EntityClassifier() # Initialization @@ -186,25 +188,33 @@ def _get_classifier_response(self, doc): ) try: if doc_info.data: - topics, topic_count, topic_details = topic_classifier_obj.predict( - doc_info.data - ) - ( - entities, - entity_count, - anonymized_doc, - entity_details, - ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer( - doc_info.data, - anonymize_snippets=ClassifierConstants.anonymize_snippets.value, - ) - doc_info.topics = topics - doc_info.entities = entities - doc_info.entityDetails = entity_details - doc_info.topicDetails = topic_details - doc_info.topicCount = topic_count - doc_info.entityCount = entity_count - doc_info.data = anonymized_doc + if self.classifier_mode and self.classifier_mode in [ + ClassificationMode.ALL.value, + ClassificationMode.TOPIC.value, + ]: + topics, topic_count, topic_details = topic_classifier_obj.predict( + doc_info.data + ) + doc_info.topics = topics + doc_info.topicDetails = topic_details + doc_info.topicCount = topic_count + if self.classifier_mode and self.classifier_mode in [ + ClassificationMode.ALL.value, + ClassificationMode.ENTITY.value, + ]: + ( + entities, + entity_count, + anonymized_doc, + entity_details, + ) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer( + doc_info.data, + anonymize_snippets=ClassifierConstants.anonymize_snippets.value, + ) + doc_info.entities = entities + doc_info.entityDetails = entity_details + doc_info.entityCount = entity_count + doc_info.data = anonymized_doc return doc_info except Exception as e: logger.error(f"Get Classifier Response Failed, Exception: {e}") diff --git a/pebblo/app/service/service.py b/pebblo/app/service/service.py index e0072cd3..b28aba80 100644 --- a/pebblo/app/service/service.py +++ b/pebblo/app/service/service.py @@ -5,8 +5,10 @@ import hashlib from datetime import datetime +from pebblo.app.config.config import var_server_config_dict from pydantic import ValidationError +from pebblo.app.enums.common import ClassificationMode from pebblo.app.enums.enums import CacheDir from pebblo.app.libs.responses import PebbloJsonResponse from pebblo.app.models.models import LoaderDocResponseModel, LoaderDocs, LoaderMetadata @@ -15,6 +17,7 @@ from pebblo.log import get_logger from pebblo.reports.reports import Reports +config_details = var_server_config_dict.get() logger = get_logger(__name__) @@ -26,6 +29,7 @@ class AppLoaderDoc: def __init__(self): self.data = None self.app_name = None + self.classifier_mode = None def _initialize_data(self, data): self.data = data @@ -122,6 +126,13 @@ def process_request(self, data): """ This process is entrypoint function for loader doc API implementation. """ + if not data.get("classifier_mode"): + self.classifier_mode = config_details.get("classifier", {}).get( + "mode", ClassificationMode.ALL.value + ) + else: + self.classifier_mode = data.get("classifier_mode") + self._initialize_data(data) try: @@ -161,7 +172,7 @@ def process_request(self, data): self._upsert_loader_details(app_details) # process input docs, app details, and generate final report - loader_helper_obj = LoaderHelper(app_details, self.data, load_id) + loader_helper_obj = LoaderHelper(app_details, self.data, load_id, self.classifier_mode) ( app_details, final_report, diff --git a/tests/app/service/test_loader_doc.py b/tests/app/service/test_loader_doc.py index c7d03dde..92acd35f 100644 --- a/tests/app/service/test_loader_doc.py +++ b/tests/app/service/test_loader_doc.py @@ -74,7 +74,7 @@ @pytest.fixture def loader_helper(): - return LoaderHelper(app_details, data=data, load_id=data.get("load_id")) + return LoaderHelper(app_details, data=data, load_id=data.get("load_id"), classifier_mode="all") @pytest.fixture From 8ade07374eed69fe61cc2df29dc0e1cfe8953c4a Mon Sep 17 00:00:00 2001 From: Dristy Srivastava Date: Thu, 12 Sep 2024 12:25:12 +0530 Subject: [PATCH 2/3] Fixing lint --- tests/app/service/test_loader_doc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/app/service/test_loader_doc.py b/tests/app/service/test_loader_doc.py index 92acd35f..54eac7fc 100644 --- a/tests/app/service/test_loader_doc.py +++ b/tests/app/service/test_loader_doc.py @@ -74,7 +74,9 @@ @pytest.fixture def loader_helper(): - return LoaderHelper(app_details, data=data, load_id=data.get("load_id"), classifier_mode="all") + return LoaderHelper( + app_details, data=data, load_id=data.get("load_id"), classifier_mode="all" + ) @pytest.fixture From c343bc6be21262f1c3c17e78f065b52827dce2b7 Mon Sep 17 00:00:00 2001 From: Dristy Srivastava Date: Thu, 12 Sep 2024 12:52:10 +0530 Subject: [PATCH 3/3] Fixing lint --- pebblo/app/service/service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pebblo/app/service/service.py b/pebblo/app/service/service.py index b28aba80..e8ce7912 100644 --- a/pebblo/app/service/service.py +++ b/pebblo/app/service/service.py @@ -5,9 +5,9 @@ import hashlib from datetime import datetime -from pebblo.app.config.config import var_server_config_dict from pydantic import ValidationError +from pebblo.app.config.config import var_server_config_dict from pebblo.app.enums.common import ClassificationMode from pebblo.app.enums.enums import CacheDir from pebblo.app.libs.responses import PebbloJsonResponse @@ -172,7 +172,9 @@ def process_request(self, data): self._upsert_loader_details(app_details) # process input docs, app details, and generate final report - loader_helper_obj = LoaderHelper(app_details, self.data, load_id, self.classifier_mode) + loader_helper_obj = LoaderHelper( + app_details, self.data, load_id, self.classifier_mode + ) ( app_details, final_report,