From 38d31f2e13070c1a2d4a7c6a4101a697a6d9cbc9 Mon Sep 17 00:00:00 2001 From: Dristy Srivastava Date: Tue, 17 Sep 2024 12:30:41 +0530 Subject: [PATCH 1/3] New field added client-secret for Azure client secret ID. --- pebblo/entity_classifier/utils/config.py | 3 + .../entity_classifier/utils/regex_pattern.py | 1 + tests/entity_classifier/mock_response.py | 37 +- tests/entity_classifier/test_data.py | 31 +- .../test_entity_classifier.py | 369 ++++++------------ 5 files changed, 167 insertions(+), 274 deletions(-) diff --git a/pebblo/entity_classifier/utils/config.py b/pebblo/entity_classifier/utils/config.py index 016bd83c..e303a4ab 100644 --- a/pebblo/entity_classifier/utils/config.py +++ b/pebblo/entity_classifier/utils/config.py @@ -13,6 +13,7 @@ "azure-key-id": ["azure_key", "azure_key_id", "azure_id", "key"], "azure-client-secret": ["azure_client_secret", "client", "secret"], "google-api-key": ["google_api_key", "google_key", "google"], + "client-secret": ["azure_client_secret", "client-secret", "client", "secret"], } @@ -46,6 +47,7 @@ class SecretEntities(Enum): AWS_SECRET_KEY = "aws-secret-key" AZURE_KEY_ID = "azure-key-id" AZURE_CLIENT_SECRET = "azure-client-secret" + CLIENT_SECRET = "client-secret" GOOGLE_API_KEY = "google-api-key" GITHUB_FINEGRAINED_TOKEN = "github-finergrained-token" @@ -76,6 +78,7 @@ class PIIGroups(Enum): SecretEntities.AZURE_CLIENT_SECRET.value: (0.8, PIIGroups.Secrets.value), SecretEntities.GOOGLE_API_KEY.value: (0.4, PIIGroups.Secrets.value), SecretEntities.GITHUB_FINEGRAINED_TOKEN.value: (0.4, PIIGroups.Secrets.value), + SecretEntities.CLIENT_SECRET.value: (0.8, PIIGroups.Secrets.value), # Private keys Entities.PRIVATE_KEY.value: (0.4, PIIGroups.Secrets.value), Entities.DSA_PRIVATE_KEY.value: (0.4, PIIGroups.Secrets.value), diff --git a/pebblo/entity_classifier/utils/regex_pattern.py b/pebblo/entity_classifier/utils/regex_pattern.py index af3628b1..19849aa7 100644 --- a/pebblo/entity_classifier/utils/regex_pattern.py +++ b/pebblo/entity_classifier/utils/regex_pattern.py @@ -13,5 +13,6 @@ "aws-secret-key": r"""\b([A-Za-z0-9+/]{40})[ \r\n'"\x60]""", "azure-key-id": r"""(?i)(%s).{0,20}([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})""", "azure-client-secret": r"""\b(?i)(%s).{0,20}([a-z0-9_\.\-~]{34})\b""", + "client-secret": r"""\b[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\b""", "google-api-key": r"""\bAIza[0-9A-Za-z\-_]{35}\b""", } diff --git a/tests/entity_classifier/mock_response.py b/tests/entity_classifier/mock_response.py index 266879de..dda6e535 100644 --- a/tests/entity_classifier/mock_response.py +++ b/tests/entity_classifier/mock_response.py @@ -1,28 +1,29 @@ mock_input_text1_anonymize_snippet_true = """ -<PERSON>'s SSN is <US_SSN>. +Sachin's SSN is <US_SSN>. ITIN number <US_ITIN> His AWS Access Key is: <AWS_ACCESS_KEY>. -And <PERSON> is: <GITHUB_TOKEN> +And Github Token is: <GITHUB_TOKEN> """ mock_input_text2_anonymize_snippet_true = """ Content -"<PERSON> board on <DATE_TIME> announced an interim dividend of Re 1 per equity share of the face value of Rs 2 each, i.e., a 50 per cent payout for <DATE_TIME> along with financial results for the <DATE_TIME> period of the company for <DATE_TIME>." -"<PERSON> reminded the board of the scheduled retreat coming up in <DATE_TIME>, and provided a drafted retreat schedule. The board provided feedback on the agenda and the consensus was that, outside of making a few minor changes, the committee should move forward as planned. No board action required." +"Wipros board on Friday, January 12 announced an interim dividend of Re 1 per equity share of the face value of Rs 2 each, i.e., a 50 per cent payout for the current financial year along with financial results for the October-December period of the company for the financial year ending March 2024." +"Roberts reminded the board of the scheduled retreat coming up in three months, and provided a drafted retreat schedule. The board provided feedback on the agenda and the consensus was that, outside of making a few minor changes, the committee should move forward as planned. No board action required." "Claims: An adaptive pacing system for implantable cardiac devices, comprising a pulse generator, multiple sensing electrodes, a microprocessor-based control unit, a wireless communication module, and memory for dynamically adjusting pacing parameters based on real-time physiological data. The system of claim 1, wherein the adaptive pacing algorithms include rate-responsive pacing based on physical activity. The system of claim 1, further comprising an external monitoring system for remote data access and modification of pacing parameters." -"<PERSON>'s SSN is <US_SSN>. His passport ID is 5484880UA. -<PERSON>'s driver's license number is <NRP>. -<PERSON>'s bank account number is 70048841700216300. -His <NRP> express credit card number is <CREDIT_CARD>. -His UK IBAN Code is <IBAN_CODE>. -ITIN number <US_ITIN>. -Azure client secret : c4cb6f91-15a7-4e6d-a824-abcdef012345. -AWS Access Key is: <AWS_ACCESS_KEY> -AWS Secret Key is : <AWS_SECRET_KEY> -Github Token is: <GITHUB_TOKEN> -Google API key: <PERSON><PERSON> is: <SLACK_TOKEN> -Azure Client Secret - c4cb6f91-15a7-4e6d-a824-abcdef012345 -<PERSON> - <SLACK_TOKEN> +"Sachin's SSN is <US_SSN>. His passport ID is 5484880UA. +Sachin's driver's license number is <US_DRIVER_LICENSE>. +Sachin's bank account number is <US_BANK_NUMBER>. +His American express credit card number is <CREDIT_CARD>. +His UK IBAN Code is <IBAN_CODE>. +ITIN number <US_ITIN>. +Azure client secret : <CLIENT_SECRET>. +AWS Access Key is: <AWS_ACCESS_KEY> +AWS Secret Key is : <AWS_SECRET_KEY>Github Token is: <GITHUB_TOKEN> +Google API key: zaCELgL0imfnc8mVLWwsAawjYr4Rx-Af50DDqtlx +Slack Token is: <SLACK_TOKEN> +Azure Client Secret - <CLIENT_SECRET> +Slack Token - <SLACK_TOKEN> Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY" -IP Address - <IP_ADDRESS> +My IP Address - <IP_ADDRESS> +Azure client-secret is <CLIENT_SECRET> """ diff --git a/tests/entity_classifier/test_data.py b/tests/entity_classifier/test_data.py index 1f1fdcd2..5731a6c9 100644 --- a/tests/entity_classifier/test_data.py +++ b/tests/entity_classifier/test_data.py @@ -10,22 +10,23 @@ "Wipros board on Friday, January 12 announced an interim dividend of Re 1 per equity share of the face value of Rs 2 each, i.e., a 50 per cent payout for the current financial year along with financial results for the October-December period of the company for the financial year ending March 2024." "Roberts reminded the board of the scheduled retreat coming up in three months, and provided a drafted retreat schedule. The board provided feedback on the agenda and the consensus was that, outside of making a few minor changes, the committee should move forward as planned. No board action required." "Claims: An adaptive pacing system for implantable cardiac devices, comprising a pulse generator, multiple sensing electrodes, a microprocessor-based control unit, a wireless communication module, and memory for dynamically adjusting pacing parameters based on real-time physiological data. The system of claim 1, wherein the adaptive pacing algorithms include rate-responsive pacing based on physical activity. The system of claim 1, further comprising an external monitoring system for remote data access and modification of pacing parameters." -"Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. -Sachin's driver's license number is S9998888. -Sachin's bank account number is 70048841700216300. -His American express credit card number is 371449635398431. -His UK IBAN Code is AZ96AZEJ00000000001234567890. -ITIN number 993-77 0690. -Azure client secret : c4cb6f91-15a7-4e6d-a824-abcdef012345. -AWS Access Key is: AKIAQIPT4PDORIRTV6PH -AWS Secret Key is : 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 -Github Token is: ghp_hgu657yiujgwfrtigu3ver238765tyuhygvtrder6t7gyvhbuy5e676578976tyghy76578uygfyfgcyturtdf -Google API key: zaCELgL0imfnc8mVLWwsAawjYr4Rx-Af50DDqtlx -Slack Token is: xoxp-7676545380258-uygh -Azure Client Secret - c4cb6f91-15a7-4e6d-a824-abcdef012345 -Slack Token - xoxb-3204014939555-4519358291237-TTIf0243T8YFSAGEVr1wBrWE +"Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. +Sachin's driver's license number is S9998888. +Sachin's bank account number is 70048841700216300. +His American express credit card number is 371449635398431. +His UK IBAN Code is AZ96AZEJ00000000001234567890. +ITIN number 993-77 0690. +Azure client secret : c4cb6f91-15a7-4e6d-a824-abcdef012345. +AWS Access Key is: AKIAQIPT4PDORIRTV6PH +AWS Secret Key is : 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 +Github Token is: ghp_hgu657yiujgwfrtigu3ver238765tyuhygvtrder6t7gyvhbuy5e676578976tyghy76578uygfyfgcyturtdf +Google API key: zaCELgL0imfnc8mVLWwsAawjYr4Rx-Af50DDqtlx +Slack Token is: xoxp-7676545380258-uygh +Azure Client Secret - c4cb6f91-15a7-4e6d-a824-abcdef012345 +Slack Token - xoxb-3204014939555-4519358291237-TTIf0243T8YFSAGEVr1wBrWE Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY" -My IP Address - 10.55.60.61 +My IP Address - 10.55.60.61 +Azure client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda """ negative_data = """ diff --git a/tests/entity_classifier/test_entity_classifier.py b/tests/entity_classifier/test_entity_classifier.py index 25181766..d3425eb5 100644 --- a/tests/entity_classifier/test_entity_classifier.py +++ b/tests/entity_classifier/test_entity_classifier.py @@ -1,6 +1,3 @@ -from typing import List, Tuple -from unittest.mock import Mock, patch - import pytest from pebblo.entity_classifier.entity_classifier import EntityClassifier @@ -17,211 +14,23 @@ def __init__(self, entity_type): @pytest.fixture -def mocked_objects(): - with ( - patch( - "pebblo.entity_classifier.entity_classifier.AnalyzerEngine" - ) as mock_analyzer, - patch( - "pebblo.entity_classifier.entity_classifier.AnalyzerEngine" - ) as mock_anomyzer, - patch( - "pebblo.entity_classifier.utils.utils.add_custom_regex_analyzer_registry" - ) as mock_custom_registry, - ): - yield mock_analyzer, mock_anomyzer, mock_custom_registry - - -@pytest.fixture -def mocked_entity_classifier_response(mocker): - """ - Mocking entity classifier response - """ - mocker.patch( - "pebblo.entity_classifier.entity_classifier.EntityClassifier.analyze_response", - return_value=Mock(), - ) - - anonymize_response1: Tuple[list, str] = ( - [ - TestAnonymizerResult("GITHUB_TOKEN"), - TestAnonymizerResult("AWS_ACCESS_KEY"), - TestAnonymizerResult("US_ITIN"), - TestAnonymizerResult("US_SSN"), - ], - mock_input_text1_anonymize_snippet_true, - ) - anonymize_response2: Tuple[list, str] = ( - [ - TestAnonymizerResult("SLACK_TOKEN"), - TestAnonymizerResult("SLACK_TOKEN"), - TestAnonymizerResult("GITHUB_TOKEN"), - TestAnonymizerResult("AWS_SECRET_KEY"), - TestAnonymizerResult("AWS_ACCESS_KEY"), - TestAnonymizerResult("US_ITIN"), - TestAnonymizerResult("IBAN_CODE"), - TestAnonymizerResult("CREDIT_CARD"), - TestAnonymizerResult("US_SSN"), - TestAnonymizerResult("IP_ADDRESS"), - ], - mock_input_text2_anonymize_snippet_true, - ) - anonymize_negative_response: Tuple[list, str] = ( - [], - negative_data, - ) - mocker.patch( - "pebblo.entity_classifier.entity_classifier.EntityClassifier.anonymize_response", - side_effect=[ - anonymize_response1, - anonymize_response2, - anonymize_negative_response, - ], - ) - - analyzed_entities_response1: List[dict] = [ - {"entity_type": "US_SSN", "location": "17_28", "confidence_score": 0.85}, - {"entity_type": "US_ITIN", "location": "42_53", "confidence_score": 0.85}, - { - "entity_type": "AWS_ACCESS_KEY", - "location": "77_97", - "confidence_score": 0.8, - }, - { - "entity_type": "GITHUB_TOKEN", - "location": "120_210", - "confidence_score": 0.8, - }, - ] - analyzed_entities_response2: List[dict] = [ - {"entity_type": "US_SSN", "location": "17_25", "confidence_score": 0.85}, - {"entity_type": "US_ITIN", "location": "39_48", "confidence_score": 0.85}, - { - "entity_type": "AWS_ACCESS_KEY", - "location": "72_88", - "confidence_score": 0.8, - }, - { - "entity_type": "GITHUB_TOKEN", - "location": "111_125", - "confidence_score": 0.8, - }, - ] - analyzed_entities_response3: List[dict] = [ - { - "entity_type": "CREDIT_CARD", - "location": "1367_1382", - "confidence_score": 1.0, - }, - { - "entity_type": "IBAN_CODE", - "location": "1406_1434", - "confidence_score": 1.0, - }, - {"entity_type": "US_SSN", "location": "1178_1189", "confidence_score": 0.85}, - {"entity_type": "US_ITIN", "location": "1450_1461", "confidence_score": 0.85}, - { - "entity_type": "AWS_ACCESS_KEY", - "location": "1545_1565", - "confidence_score": 0.8, - }, - { - "entity_type": "AWS_SECRET_KEY", - "location": "1587_1628", - "confidence_score": 0.8, - }, - { - "entity_type": "GITHUB_TOKEN", - "location": "1646_1736", - "confidence_score": 0.8, - }, - { - "entity_type": "SLACK_TOKEN", - "location": "1812_1835", - "confidence_score": 0.8, - }, - { - "entity_type": "SLACK_TOKEN", - "location": "1911_1968", - "confidence_score": 0.8, - }, - {"entity_type": "IP_ADDRESS", "location": "1339_1355", "confidence_score": 0.8}, - ] - analyzed_entities_response4: List[dict] = [ - { - "entity_type": "CREDIT_CARD", - "location": "1178_1186", - "confidence_score": 1.0, - }, - { - "entity_type": "IBAN_CODE", - "location": "1364_1377", - "confidence_score": 1.0, - }, - {"entity_type": "US_SSN", "location": "1401_1412", "confidence_score": 0.85}, - {"entity_type": "US_ITIN", "location": "1428_1437", "confidence_score": 0.85}, - { - "entity_type": "AWS_ACCESS_KEY", - "location": "1521_1537", - "confidence_score": 0.8, - }, - { - "entity_type": "AWS_SECRET_KEY", - "location": "1559_1575", - "confidence_score": 0.8, - }, - { - "entity_type": "GITHUB_TOKEN", - "location": "1593_1607", - "confidence_score": 0.8, - }, - { - "entity_type": "SLACK_TOKEN", - "location": "1683_1696", - "confidence_score": 0.8, - }, - { - "entity_type": "SLACK_TOKEN", - "location": "1772_1785", - "confidence_score": 0.8, - }, - {"entity_type": "IP_ADDRESS", "location": "1339_1355", "confidence_score": 0.8}, - ] - analyzed_entities_negative_response1: List = [] - analyzed_entities_negative_response2: List = [] - mocker.patch( - "pebblo.entity_classifier.entity_classifier.EntityClassifier.get_analyzed_entities_response", - side_effect=[ - analyzed_entities_response1, - analyzed_entities_response2, - analyzed_entities_response3, - analyzed_entities_response4, - analyzed_entities_negative_response1, - analyzed_entities_negative_response2, - ], - ) - - -@pytest.fixture -def entity_classifier(mocked_objects): +def entity_classifier(): """ Create an instance of the EntityClassifier class """ return EntityClassifier() -def test_entity_classifier_init(mocked_objects) -> None: +def test_entity_classifier_init() -> None: """ Initiated Entity Classifier """ _ = EntityClassifier() -def test_presidio_entity_classifier_and_anonymizer( - entity_classifier, mocked_entity_classifier_response -): +def test_entity_classifier_and_anonymizer1(entity_classifier): """ - UTs for presidio_entity_classifier_and_anonymizer function + UT for presidio_entity_classifier_and_anonymizer function with input_text1 """ ( entities, @@ -285,34 +94,39 @@ def test_presidio_entity_classifier_and_anonymizer( assert entity_details == { "us-ssn": [ { - "location": "17_25", + "location": "17_31", "confidence_score": "HIGH", "entity_group": "pii-identification", } ], "us-itin": [ { - "location": "39_48", + "location": "45_60", "confidence_score": "HIGH", "entity_group": "pii-financial", } ], "aws-access-key": [ { - "location": "72_88", + "location": "84_106", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "github-token": [ { - "location": "111_125", + "location": "129_149", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], } + +def test_entity_classifier_and_anonymizer2(entity_classifier): + """ + UT for presidio_entity_classifier_and_anonymizer function with input_text2 + """ ( entities, total_count, @@ -320,83 +134,117 @@ def test_presidio_entity_classifier_and_anonymizer( entity_details, ) = entity_classifier.presidio_entity_classifier_and_anonymizer(input_text2) assert entities == { - "slack-token": 2, - "github-token": 1, + "us-ssn": 1, + "us-drivers-license": 1, + "us-bank-account-number": 1, + "credit-card-number": 1, + "iban-code": 1, + "us-itin": 1, + "client-secret": 3, "aws-access-key": 1, "aws-secret-key": 1, - "us-itin": 1, - "iban-code": 1, - "credit-card-number": 1, - "us-ssn": 1, + "github-token": 1, + "slack-token": 2, "ip-address": 1, } - assert total_count == 10 + assert total_count == 15 assert anonymized_text == input_text2 assert entity_details == { - "credit-card-number": [ + "us-ssn": [ { - "location": "1367_1382", + "location": "1178_1189", "confidence_score": "HIGH", + "entity_group": "pii-identification", + } + ], + "us-drivers-license": [ + { + "location": "1257_1265", + "confidence_score": "MEDIUM", + "entity_group": "pii-identification", + } + ], + "us-bank-account-number": [ + { + "location": "1299_1316", + "confidence_score": "MEDIUM", "entity_group": "pii-financial", } ], - "iban-code": [ + "credit-card-number": [ { - "location": "1406_1434", + "location": "1361_1376", "confidence_score": "HIGH", "entity_group": "pii-financial", } ], - "us-ssn": [ + "iban-code": [ { - "location": "1178_1189", + "location": "1398_1426", "confidence_score": "HIGH", - "entity_group": "pii-identification", + "entity_group": "pii-financial", } ], "us-itin": [ { - "location": "1450_1461", + "location": "1440_1451", "confidence_score": "HIGH", "entity_group": "pii-financial", } ], + "client-secret": [ + { + "location": "1475_1511", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + }, + { + "location": "1841_1877", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + }, + { + "location": "2058_2094", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + }, + ], "aws-access-key": [ { - "location": "1545_1565", + "location": "1532_1552", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "aws-secret-key": [ { - "location": "1587_1628", + "location": "1573_1614", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "github-token": [ { - "location": "1646_1736", + "location": "1631_1721", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "slack-token": [ { - "location": "1812_1835", + "location": "1795_1818", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1911_1968", + "location": "1892_1949", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, ], "ip-address": [ { - "location": "1339_1355", + "location": "2023_2034", "confidence_score": "HIGH", "entity_group": "pii-network", } @@ -412,89 +260,128 @@ def test_presidio_entity_classifier_and_anonymizer( input_text2, anonymize_snippets=True ) assert entities == { - "slack-token": 2, - "github-token": 1, + "us-ssn": 1, + "us-drivers-license": 1, + "us-bank-account-number": 1, + "credit-card-number": 1, + "iban-code": 1, + "us-itin": 1, + "client-secret": 3, "aws-access-key": 1, "aws-secret-key": 1, - "us-itin": 1, - "iban-code": 1, - "credit-card-number": 1, - "us-ssn": 1, + "github-token": 1, + "slack-token": 2, "ip-address": 1, } - assert total_count == 10 + assert total_count == 15 assert anonymized_text == mock_input_text2_anonymize_snippet_true assert entity_details == { - "credit-card-number": [ + "us-ssn": [ { - "location": "1178_1186", + "location": "1178_1192", "confidence_score": "HIGH", + "entity_group": "pii-identification", + } + ], + "us-drivers-license": [ + { + "location": "1260_1285", + "confidence_score": "MEDIUM", + "entity_group": "pii-identification", + } + ], + "us-bank-account-number": [ + { + "location": "1319_1341", + "confidence_score": "MEDIUM", "entity_group": "pii-financial", } ], - "iban-code": [ + "credit-card-number": [ { - "location": "1364_1377", + "location": "1386_1405", "confidence_score": "HIGH", "entity_group": "pii-financial", } ], - "us-ssn": [ + "iban-code": [ { - "location": "1401_1412", + "location": "1427_1444", "confidence_score": "HIGH", - "entity_group": "pii-identification", + "entity_group": "pii-financial", } ], "us-itin": [ { - "location": "1428_1437", + "location": "1458_1473", "confidence_score": "HIGH", "entity_group": "pii-financial", } ], + "client-secret": [ + { + "location": "1497_1518", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + }, + { + "location": "1757_1778", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + }, + { + "location": "1928_1949", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + }, + ], "aws-access-key": [ { - "location": "1521_1537", + "location": "1539_1561", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "aws-secret-key": [ { - "location": "1559_1575", + "location": "1582_1604", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "github-token": [ { - "location": "1593_1607", + "location": "1621_1641", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "slack-token": [ { - "location": "1683_1696", + "location": "1715_1734", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1772_1785", + "location": "1793_1812", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, ], "ip-address": [ { - "location": "1339_1355", + "location": "1886_1904", "confidence_score": "HIGH", "entity_group": "pii-network", } ], } + +def test_entity_classifier_and_anonymizer_negative_data(entity_classifier): + """ + UT for presidio_entity_classifier_and_anonymizer function with negative_data + """ ( entities, total_count, From d1632a5e99ddb7b5b0a8f3e2213abf263df65c09 Mon Sep 17 00:00:00 2001 From: Dristy Srivastava Date: Wed, 18 Sep 2024 10:09:49 +0530 Subject: [PATCH 2/3] Updating client-secret to axure-client-secret --- docs/gh_pages/docs/entityclassifier.md | 1 + pebblo/entity_classifier/README.md | 1 + pebblo/entity_classifier/utils/config.py | 5 +--- .../entity_classifier/utils/regex_pattern.py | 3 +-- tests/entity_classifier/mock_response.py | 6 ++--- .../test_entity_classifier.py | 26 +++++++++---------- 6 files changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/gh_pages/docs/entityclassifier.md b/docs/gh_pages/docs/entityclassifier.md index 612b69c3..c21a9510 100644 --- a/docs/gh_pages/docs/entityclassifier.md +++ b/docs/gh_pages/docs/entityclassifier.md @@ -31,6 +31,7 @@ Below is the list of `entities` supported by Pebblo - 1. RSA Private Key 1. Google Account Private Key 1. Github Fine Grained Token +1. Azure Client Secret Key User can get details of classified entities for their loader source files in Pebblo report. diff --git a/pebblo/entity_classifier/README.md b/pebblo/entity_classifier/README.md index ab180196..6d2965cf 100644 --- a/pebblo/entity_classifier/README.md +++ b/pebblo/entity_classifier/README.md @@ -25,6 +25,7 @@ And following Secret Entities: 10. RSA Private Key 11. Google Account Private Key 12. Github Fine Grained Token +13. Azure Client Secret Key ## How to use Entity Classifier diff --git a/pebblo/entity_classifier/utils/config.py b/pebblo/entity_classifier/utils/config.py index e303a4ab..1c0f94b2 100644 --- a/pebblo/entity_classifier/utils/config.py +++ b/pebblo/entity_classifier/utils/config.py @@ -11,9 +11,8 @@ "aws-access-key": ["aws_access_key", "aws_key", "access", "id", "api"], "aws-secret-key": ["aws_secret_key", "secret"], "azure-key-id": ["azure_key", "azure_key_id", "azure_id", "key"], - "azure-client-secret": ["azure_client_secret", "client", "secret"], + "azure-client-secret": ["azure_client_secret", "client", "secret", "client-secret"], "google-api-key": ["google_api_key", "google_key", "google"], - "client-secret": ["azure_client_secret", "client-secret", "client", "secret"], } @@ -47,7 +46,6 @@ class SecretEntities(Enum): AWS_SECRET_KEY = "aws-secret-key" AZURE_KEY_ID = "azure-key-id" AZURE_CLIENT_SECRET = "azure-client-secret" - CLIENT_SECRET = "client-secret" GOOGLE_API_KEY = "google-api-key" GITHUB_FINEGRAINED_TOKEN = "github-finergrained-token" @@ -78,7 +76,6 @@ class PIIGroups(Enum): SecretEntities.AZURE_CLIENT_SECRET.value: (0.8, PIIGroups.Secrets.value), SecretEntities.GOOGLE_API_KEY.value: (0.4, PIIGroups.Secrets.value), SecretEntities.GITHUB_FINEGRAINED_TOKEN.value: (0.4, PIIGroups.Secrets.value), - SecretEntities.CLIENT_SECRET.value: (0.8, PIIGroups.Secrets.value), # Private keys Entities.PRIVATE_KEY.value: (0.4, PIIGroups.Secrets.value), Entities.DSA_PRIVATE_KEY.value: (0.4, PIIGroups.Secrets.value), diff --git a/pebblo/entity_classifier/utils/regex_pattern.py b/pebblo/entity_classifier/utils/regex_pattern.py index 19849aa7..51ecbf63 100644 --- a/pebblo/entity_classifier/utils/regex_pattern.py +++ b/pebblo/entity_classifier/utils/regex_pattern.py @@ -12,7 +12,6 @@ "aws-access-key": r"""\b((?:AKIA|ABIA|ACCA|ASIA)[0-9A-Z]{16})\b""", "aws-secret-key": r"""\b([A-Za-z0-9+/]{40})[ \r\n'"\x60]""", "azure-key-id": r"""(?i)(%s).{0,20}([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})""", - "azure-client-secret": r"""\b(?i)(%s).{0,20}([a-z0-9_\.\-~]{34})\b""", - "client-secret": r"""\b[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\b""", + "azure-client-secret": r"""\b[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}\b""", "google-api-key": r"""\bAIza[0-9A-Za-z\-_]{35}\b""", } diff --git a/tests/entity_classifier/mock_response.py b/tests/entity_classifier/mock_response.py index dda6e535..4e1c7744 100644 --- a/tests/entity_classifier/mock_response.py +++ b/tests/entity_classifier/mock_response.py @@ -16,14 +16,14 @@ His American express credit card number is <CREDIT_CARD>. His UK IBAN Code is <IBAN_CODE>. ITIN number <US_ITIN>. -Azure client secret : <CLIENT_SECRET>. +Azure client secret : <AZURE_CLIENT_SECRET>. AWS Access Key is: <AWS_ACCESS_KEY> AWS Secret Key is : <AWS_SECRET_KEY>Github Token is: <GITHUB_TOKEN> Google API key: zaCELgL0imfnc8mVLWwsAawjYr4Rx-Af50DDqtlx Slack Token is: <SLACK_TOKEN> -Azure Client Secret - <CLIENT_SECRET> +Azure Client Secret - <AZURE_CLIENT_SECRET> Slack Token - <SLACK_TOKEN> Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY" My IP Address - <IP_ADDRESS> -Azure client-secret is <CLIENT_SECRET> +Azure client-secret is <AZURE_CLIENT_SECRET> """ diff --git a/tests/entity_classifier/test_entity_classifier.py b/tests/entity_classifier/test_entity_classifier.py index d3425eb5..86ca0d29 100644 --- a/tests/entity_classifier/test_entity_classifier.py +++ b/tests/entity_classifier/test_entity_classifier.py @@ -140,7 +140,7 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "credit-card-number": 1, "iban-code": 1, "us-itin": 1, - "client-secret": 3, + "azure-client-secret": 3, "aws-access-key": 1, "aws-secret-key": 1, "github-token": 1, @@ -192,7 +192,7 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "entity_group": "pii-financial", } ], - "client-secret": [ + "azure-client-secret": [ { "location": "1475_1511", "confidence_score": "HIGH", @@ -266,7 +266,7 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "credit-card-number": 1, "iban-code": 1, "us-itin": 1, - "client-secret": 3, + "azure-client-secret": 3, "aws-access-key": 1, "aws-secret-key": 1, "github-token": 1, @@ -318,59 +318,59 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "entity_group": "pii-financial", } ], - "client-secret": [ + "azure-client-secret": [ { - "location": "1497_1518", + "location": "1497_1524", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1757_1778", + "location": "1763_1790", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1928_1949", + "location": "1940_1967", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, ], "aws-access-key": [ { - "location": "1539_1561", + "location": "1545_1567", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "aws-secret-key": [ { - "location": "1582_1604", + "location": "1588_1610", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "github-token": [ { - "location": "1621_1641", + "location": "1627_1647", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "slack-token": [ { - "location": "1715_1734", + "location": "1721_1740", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1793_1812", + "location": "1805_1824", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, ], "ip-address": [ { - "location": "1886_1904", + "location": "1898_1916", "confidence_score": "HIGH", "entity_group": "pii-network", } From cfe749da6ec2e2b2a133caaf8185817ed52cdfec Mon Sep 17 00:00:00 2001 From: Dristy Srivastava Date: Wed, 18 Sep 2024 12:00:13 +0530 Subject: [PATCH 3/3] Updated test cases --- pebblo/entity_classifier/utils/config.py | 2 +- tests/entity_classifier/mock_response.py | 4 +- tests/entity_classifier/test_data.py | 30 +++- .../test_entity_classifier.py | 133 +++++++++++------- 4 files changed, 111 insertions(+), 58 deletions(-) diff --git a/pebblo/entity_classifier/utils/config.py b/pebblo/entity_classifier/utils/config.py index 1c0f94b2..23da3e1a 100644 --- a/pebblo/entity_classifier/utils/config.py +++ b/pebblo/entity_classifier/utils/config.py @@ -11,7 +11,7 @@ "aws-access-key": ["aws_access_key", "aws_key", "access", "id", "api"], "aws-secret-key": ["aws_secret_key", "secret"], "azure-key-id": ["azure_key", "azure_key_id", "azure_id", "key"], - "azure-client-secret": ["azure_client_secret", "client", "secret", "client-secret"], + "azure-client-secret": ["azure_client_secret", "client-secret", "client_secret"], "google-api-key": ["google_api_key", "google_key", "google"], } diff --git a/tests/entity_classifier/mock_response.py b/tests/entity_classifier/mock_response.py index 4e1c7744..28bbf118 100644 --- a/tests/entity_classifier/mock_response.py +++ b/tests/entity_classifier/mock_response.py @@ -16,14 +16,12 @@ His American express credit card number is <CREDIT_CARD>. His UK IBAN Code is <IBAN_CODE>. ITIN number <US_ITIN>. -Azure client secret : <AZURE_CLIENT_SECRET>. AWS Access Key is: <AWS_ACCESS_KEY> AWS Secret Key is : <AWS_SECRET_KEY>Github Token is: <GITHUB_TOKEN> Google API key: zaCELgL0imfnc8mVLWwsAawjYr4Rx-Af50DDqtlx Slack Token is: <SLACK_TOKEN> -Azure Client Secret - <AZURE_CLIENT_SECRET> Slack Token - <SLACK_TOKEN> Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY" My IP Address - <IP_ADDRESS> -Azure client-secret is <AZURE_CLIENT_SECRET> +Azure client_secret is <AZURE_CLIENT_SECRET> """ diff --git a/tests/entity_classifier/test_data.py b/tests/entity_classifier/test_data.py index 5731a6c9..c57004dd 100644 --- a/tests/entity_classifier/test_data.py +++ b/tests/entity_classifier/test_data.py @@ -16,17 +16,15 @@ His American express credit card number is 371449635398431. His UK IBAN Code is AZ96AZEJ00000000001234567890. ITIN number 993-77 0690. -Azure client secret : c4cb6f91-15a7-4e6d-a824-abcdef012345. AWS Access Key is: AKIAQIPT4PDORIRTV6PH AWS Secret Key is : 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 Github Token is: ghp_hgu657yiujgwfrtigu3ver238765tyuhygvtrder6t7gyvhbuy5e676578976tyghy76578uygfyfgcyturtdf Google API key: zaCELgL0imfnc8mVLWwsAawjYr4Rx-Af50DDqtlx Slack Token is: xoxp-7676545380258-uygh -Azure Client Secret - c4cb6f91-15a7-4e6d-a824-abcdef012345 Slack Token - xoxb-3204014939555-4519358291237-TTIf0243T8YFSAGEVr1wBrWE Google API key- KLzaSyB_tWrbmfWx8g2bzL7Vhq7znuTUn0JPKmY" My IP Address - 10.55.60.61 -Azure client-secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda +Azure client_secret is de1d4a2d-d9fa-44f1-84bb-4f73c004afda """ negative_data = """ @@ -34,3 +32,29 @@ His AWS Access Key is: AKIPT4PDORIRTV6PH. And Github Token is: ghpu657yiujgwfrtigu3ver238765tyuhygvtrder6t7gyvhbuy5e676578976tyghy76578uygfyfgcyturtdf """ + +tf_test_data = """ +variable "client_secret" { +} + +# We strongly recommend using the required_providers block to set the +# Azure Provider source and version being used +terraform { + required_providers { + azurerm = { + source = "hashicorp/azurerm" + version = "~> 4.x" + } + } +} + +# Configure the Microsoft Azure Provider +provider "azurerm" { + features {} + + client_id = "00000000-0000-0000-0000-000000000000" + client_secret = "1131a1fc-8cee-4f3c-9b2f-6808f66f72a4" + tenant_id = "10000000-0000-0000-0000-000000000000" + subscription_id = "20000000-0000-0000-0000-000000000000" +} +""" diff --git a/tests/entity_classifier/test_entity_classifier.py b/tests/entity_classifier/test_entity_classifier.py index 86ca0d29..c96d58c0 100644 --- a/tests/entity_classifier/test_entity_classifier.py +++ b/tests/entity_classifier/test_entity_classifier.py @@ -5,7 +5,12 @@ mock_input_text1_anonymize_snippet_true, mock_input_text2_anonymize_snippet_true, ) -from tests.entity_classifier.test_data import input_text1, input_text2, negative_data +from tests.entity_classifier.test_data import ( + input_text1, + input_text2, + negative_data, + tf_test_data, +) class TestAnonymizerResult: @@ -140,14 +145,14 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "credit-card-number": 1, "iban-code": 1, "us-itin": 1, - "azure-client-secret": 3, "aws-access-key": 1, "aws-secret-key": 1, "github-token": 1, "slack-token": 2, "ip-address": 1, + "azure-client-secret": 1, } - assert total_count == 15 + assert total_count == 13 assert anonymized_text == input_text2 assert entity_details == { "us-ssn": [ @@ -192,63 +197,53 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "entity_group": "pii-financial", } ], - "azure-client-secret": [ - { - "location": "1475_1511", - "confidence_score": "HIGH", - "entity_group": "secrets_and_tokens", - }, - { - "location": "1841_1877", - "confidence_score": "HIGH", - "entity_group": "secrets_and_tokens", - }, - { - "location": "2058_2094", - "confidence_score": "HIGH", - "entity_group": "secrets_and_tokens", - }, - ], "aws-access-key": [ { - "location": "1532_1552", + "location": "1472_1492", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "aws-secret-key": [ { - "location": "1573_1614", + "location": "1513_1554", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "github-token": [ { - "location": "1631_1721", + "location": "1571_1661", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "slack-token": [ { - "location": "1795_1818", + "location": "1735_1758", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1892_1949", + "location": "1773_1830", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, ], "ip-address": [ { - "location": "2023_2034", + "location": "1904_1915", "confidence_score": "HIGH", "entity_group": "pii-network", } ], + "azure-client-secret": [ + { + "location": "1939_1975", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + } + ], } ( @@ -266,14 +261,14 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "credit-card-number": 1, "iban-code": 1, "us-itin": 1, - "azure-client-secret": 3, "aws-access-key": 1, "aws-secret-key": 1, "github-token": 1, "slack-token": 2, "ip-address": 1, + "azure-client-secret": 1, } - assert total_count == 15 + assert total_count == 13 assert anonymized_text == mock_input_text2_anonymize_snippet_true assert entity_details == { "us-ssn": [ @@ -318,63 +313,53 @@ def test_entity_classifier_and_anonymizer2(entity_classifier): "entity_group": "pii-financial", } ], - "azure-client-secret": [ - { - "location": "1497_1524", - "confidence_score": "HIGH", - "entity_group": "secrets_and_tokens", - }, - { - "location": "1763_1790", - "confidence_score": "HIGH", - "entity_group": "secrets_and_tokens", - }, - { - "location": "1940_1967", - "confidence_score": "HIGH", - "entity_group": "secrets_and_tokens", - }, - ], "aws-access-key": [ { - "location": "1545_1567", + "location": "1494_1516", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "aws-secret-key": [ { - "location": "1588_1610", + "location": "1537_1559", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "github-token": [ { - "location": "1627_1647", + "location": "1576_1596", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", } ], "slack-token": [ { - "location": "1721_1740", + "location": "1670_1689", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, { - "location": "1805_1824", + "location": "1704_1723", "confidence_score": "HIGH", "entity_group": "secrets_and_tokens", }, ], "ip-address": [ { - "location": "1898_1916", + "location": "1797_1815", "confidence_score": "HIGH", "entity_group": "pii-network", } ], + "azure-client-secret": [ + { + "location": "1839_1866", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + } + ], } @@ -403,3 +388,49 @@ def test_entity_classifier_and_anonymizer_negative_data(entity_classifier): assert entities == {} assert total_count == 0 assert anonymized_text == negative_data + + +def test_entity_classifier_and_anonymizer_azure_secret(entity_classifier): + """ + UT for presidio_entity_classifier_and_anonymizer function with tf_test_data + """ + ( + entities, + total_count, + anonymized_text, + entity_details, + ) = entity_classifier.presidio_entity_classifier_and_anonymizer(tf_test_data) + assert entities == { + "azure-client-secret": 1, + } + assert total_count == 1 + assert anonymized_text == tf_test_data + assert entity_details == { + "azure-client-secret": [ + { + "location": "430_466", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + } + ] + } + + ( + entities, + total_count, + anonymized_text, + entity_details, + ) = entity_classifier.presidio_entity_classifier_and_anonymizer(tf_test_data, True) + assert entities == { + "azure-client-secret": 1, + } + assert total_count == 1 + assert entity_details == { + "azure-client-secret": [ + { + "location": "430_457", + "confidence_score": "HIGH", + "entity_group": "secrets_and_tokens", + } + ] + }