Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding classifier mode changes for file type storage #540

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pebblo/app/api/req_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class ReqLoaderDoc(BaseModel):
loading_end: bool
source_owner: str
classifier_location: str
classifier_mode: Optional[str] = None


class Context(BaseModel):
Expand Down
50 changes: 30 additions & 20 deletions pebblo/app/service/doc_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os.path
from datetime import datetime

from pebblo.app.enums.common import ClassificationMode
from pebblo.app.enums.enums import CacheDir, ClassifierConstants, ReportConstants
from pebblo.app.models.models import (
AiDataModel,
Expand Down Expand Up @@ -36,11 +37,12 @@ class LoaderHelper:
Class for loader-doc-related tasks
"""

def __init__(self, app_details, data, load_id):
def __init__(self, app_details, data, load_id, classifier_mode):
self.app_details = app_details
self.data = data
self.load_id = load_id
self.loader_mapper = {}
self.classifier_mode = classifier_mode
self.entity_classifier_obj = EntityClassifier()

# Initialization
Expand Down Expand Up @@ -186,25 +188,33 @@ def _get_classifier_response(self, doc):
)
try:
if doc_info.data:
topics, topic_count, topic_details = topic_classifier_obj.predict(
doc_info.data
)
(
entities,
entity_count,
anonymized_doc,
entity_details,
) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
doc_info.data,
anonymize_snippets=ClassifierConstants.anonymize_snippets.value,
)
doc_info.topics = topics
doc_info.entities = entities
doc_info.entityDetails = entity_details
doc_info.topicDetails = topic_details
doc_info.topicCount = topic_count
doc_info.entityCount = entity_count
doc_info.data = anonymized_doc
if self.classifier_mode and self.classifier_mode in [
ClassificationMode.ALL.value,
ClassificationMode.TOPIC.value,
]:
topics, topic_count, topic_details = topic_classifier_obj.predict(
doc_info.data
)
doc_info.topics = topics
doc_info.topicDetails = topic_details
doc_info.topicCount = topic_count
if self.classifier_mode and self.classifier_mode in [
ClassificationMode.ALL.value,
ClassificationMode.ENTITY.value,
]:
(
entities,
entity_count,
anonymized_doc,
entity_details,
) = self.entity_classifier_obj.presidio_entity_classifier_and_anonymizer(
doc_info.data,
anonymize_snippets=ClassifierConstants.anonymize_snippets.value,
)
doc_info.entities = entities
doc_info.entityDetails = entity_details
doc_info.entityCount = entity_count
doc_info.data = anonymized_doc
return doc_info
except Exception as e:
logger.error(f"Get Classifier Response Failed, Exception: {e}")
Expand Down
15 changes: 14 additions & 1 deletion pebblo/app/service/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@

from pydantic import ValidationError

from pebblo.app.config.config import var_server_config_dict
from pebblo.app.enums.common import ClassificationMode
from pebblo.app.enums.enums import CacheDir
from pebblo.app.libs.responses import PebbloJsonResponse
from pebblo.app.models.models import LoaderDocResponseModel, LoaderDocs, LoaderMetadata
Expand All @@ -15,6 +17,7 @@
from pebblo.log import get_logger
from pebblo.reports.reports import Reports

config_details = var_server_config_dict.get()
logger = get_logger(__name__)


Expand All @@ -26,6 +29,7 @@ class AppLoaderDoc:
def __init__(self):
self.data = None
self.app_name = None
self.classifier_mode = None

def _initialize_data(self, data):
self.data = data
Expand Down Expand Up @@ -122,6 +126,13 @@ def process_request(self, data):
"""
This is the entry-point function for the loader doc API implementation.
"""
if not data.get("classifier_mode"):
self.classifier_mode = config_details.get("classifier", {}).get(
"mode", ClassificationMode.ALL.value
)
else:
self.classifier_mode = data.get("classifier_mode")

self._initialize_data(data)

try:
Expand Down Expand Up @@ -161,7 +172,9 @@ def process_request(self, data):
self._upsert_loader_details(app_details)

# process input docs, app details, and generate final report
loader_helper_obj = LoaderHelper(app_details, self.data, load_id)
loader_helper_obj = LoaderHelper(
app_details, self.data, load_id, self.classifier_mode
)
(
app_details,
final_report,
Expand Down
4 changes: 3 additions & 1 deletion tests/app/service/test_loader_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,9 @@

@pytest.fixture
def loader_helper():
return LoaderHelper(app_details, data=data, load_id=data.get("load_id"))
return LoaderHelper(
app_details, data=data, load_id=data.get("load_id"), classifier_mode="all"
)


@pytest.fixture
Expand Down