From 8231b76b2c08c8a7916abefc7e80d75e933430cb Mon Sep 17 00:00:00 2001 From: Victor Liang Date: Thu, 11 Jun 2020 14:25:20 -0700 Subject: [PATCH] add code sample and test for medical number custom detector with hotwords --- dlp/custom_infotype.py | 85 +++++++++++++++++++++++++++++++++++++ dlp/custom_infotype_test.py | 20 +++++++++ 2 files changed, 105 insertions(+) diff --git a/dlp/custom_infotype.py b/dlp/custom_infotype.py index 73cdec396024..37cc963093ef 100644 --- a/dlp/custom_infotype.py +++ b/dlp/custom_infotype.py @@ -143,3 +143,88 @@ def inspect_with_medical_record_number_custom_regex_detector( print("No findings.") # [END dlp_inspect_with_medical_record_number_custom_regex_detector] + + +# [START dlp_inspect_with_medical_record_number_w_custom_hotwords] +def inspect_with_medical_record_number_w_custom_hotwords( + project, + content_string, +): + """Uses the Data Loss Prevention API to analyze string with medical record + number custom regex detector, with custom hotwords rules to boost finding + certainty under some circumstances. + Args: + project: The Google Cloud project id to use as a parent resource. + content_string: The string to inspect. + Returns: + None; the response from the API is printed to the terminal. + """ + + # Import the client library. + import google.cloud.dlp + + # Instantiate a client. + dlp = google.cloud.dlp_v2.DlpServiceClient() + + # Construct a custom regex detector info type called "C_MRN", + # with ###-#-##### pattern, where each # represents a digit from 1 to 9. + # The detector has a detection likelihood of POSSIBLE. + custom_info_types = [ + { + "info_type": {"name": "C_MRN"}, + "regex": {"pattern": "[1-9]{3}-[1-9]{1}-[1-9]{5}"}, + "likelihood": "POSSIBLE", + } + ] + + # Construct a rule set with hotwords "mrn" and "medical", with a likelihood + # boost to VERY_LIKELY when hotwords are present within the 10 character- + # window preceding the PII finding. 
+ hotword_rule = { + "hotword_regex": { + "pattern": "(?i)(mrn|medical)(?-i)" + }, + "likelihood_adjustment": { + "fixed_likelihood": "VERY_LIKELY" + }, + "proximity": { + "window_before": 10 + } + } + + rule_set = [ + { + "info_types": [{"name": "C_MRN"}], + "rules": [{"hotword_rule": hotword_rule}], + } + ] + + # Construct the configuration dictionary with the custom regex info type. + inspect_config = { + "custom_info_types": custom_info_types, + "rule_set": rule_set, + } + + # Construct the `item`. + item = {"value": content_string} + + # Convert the project id into a full resource id. + parent = dlp.project_path(project) + + # Call the API. + response = dlp.inspect_content(parent, inspect_config, item) + + # Print out the results. + if response.result.findings: + for finding in response.result.findings: + try: + if finding.quote: + print(f"Quote: {finding.quote}") + except AttributeError: + pass + print(f"Info type: {finding.info_type.name}") + print(f"Likelihood: {finding.likelihood}") + else: + print("No findings.") + +# [END dlp_inspect_with_medical_record_number_w_custom_hotwords] diff --git a/dlp/custom_infotype_test.py b/dlp/custom_infotype_test.py index d4a0ea669ebe..b44c21b2470e 100644 --- a/dlp/custom_infotype_test.py +++ b/dlp/custom_infotype_test.py @@ -34,3 +34,23 @@ def test_inspect_with_medical_record_number_custom_regex_detector(capsys): out, _ = capsys.readouterr() assert "Info type: C_MRN" in out + + +def test_inspect_with_medical_record_number_w_custom_hotwords_no_hotwords( + capsys): + custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( + GCLOUD_PROJECT, "just a number 444-5-22222") + + out, _ = capsys.readouterr() + assert "Info type: C_MRN" in out + assert "Likelihood: 3" in out + + +def test_inspect_with_medical_record_number_w_custom_hotwords_has_hotwords( + capsys): + custom_infotype.inspect_with_medical_record_number_w_custom_hotwords( + GCLOUD_PROJECT, "Patients MRN 444-5-22222") + + out, _ = capsys.readouterr() + 
assert "Info type: C_MRN" in out + assert "Likelihood: 5" in out