diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 13a9fc66dd797..873acea420645 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -1,3 +1,4 @@ +import contextlib import logging import re from typing import Any, Dict, List, Match, Optional, Union @@ -11,6 +12,29 @@ OwnershipTypeClass, ) +logger = logging.getLogger(__name__) + + +def _get_best_match(the_match: Match, group_name: str) -> str: + with contextlib.suppress(IndexError): + return the_match.group(group_name) + + with contextlib.suppress(IndexError): + return the_match.group(1) + + return the_match.group(0) + + +_match_regexp = re.compile(r"{{\s*\$match\s*}}", flags=re.MULTILINE) + + +def _insert_match_value(original_value: str, match_value: str) -> str: + """ + Replace each "{{ $match }}" placeholder in original_value with match_value, taken literally + (backslashes are not escapes), e.g. "foo{{ $match }}bar" -> "fooXbar". No placeholder: unchanged. 
+ """ + return _match_regexp.sub(lambda _: match_value, original_value) + class Constants: ADD_TAG_OPERATION = "add_tag" @@ -60,7 +84,6 @@ class OperationProcessor: """ operation_defs: Dict[str, Dict] = {} - logger = logging.getLogger(__name__) tag_prefix: str = "" def __init__( @@ -128,7 +151,7 @@ def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]: aspect_map = self.convert_to_aspects(operations_map) except Exception as e: - self.logger.error("Error while processing operation defs over raw_props", e) + logger.error(f"Error while processing operation defs over raw_props: {e}") return aspect_map def convert_to_aspects( @@ -171,30 +194,12 @@ def get_operation_value( operation_config: Dict, match: Match, ) -> Optional[Union[str, Dict, List[str]]]: - def _get_best_match(the_match: Match, group_name: str) -> str: - result = the_match.group(0) - try: - result = the_match.group(group_name) - return result - except IndexError: - pass - try: - result = the_match.group(1) - return result - except IndexError: - pass - return result - - match_regexp = r"{{\s*\$match\s*}}" - if ( operation_type == Constants.ADD_TAG_OPERATION and operation_config[Constants.TAG] ): tag = operation_config[Constants.TAG] - tag_id = _get_best_match(match, "tag") - if isinstance(tag_id, str): - tag = re.sub(match_regexp, tag_id, tag, 0, re.MULTILINE) + tag = _insert_match_value(tag, _get_best_match(match, "tag")) if self.tag_prefix: tag = self.tag_prefix + tag @@ -226,9 +231,7 @@ def _get_best_match(the_match: Match, group_name: str) -> str: and operation_config[Constants.TERM] ): term = operation_config[Constants.TERM] - captured_term_id = _get_best_match(match, "term") - if isinstance(captured_term_id, str): - term = re.sub(match_regexp, captured_term_id, term, 0, re.MULTILINE) + term = _insert_match_value(term, _get_best_match(match, "term")) return mce_builder.make_term_urn(term) elif operation_type == Constants.ADD_TERMS_OPERATION: separator = operation_config.get(Constants.SEPARATOR, ",") 
diff --git a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json index 8b84662dc2acc..c82a8750713ed 100644 --- a/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json +++ b/metadata-ingestion/tests/integration/dbt/dbt_test_column_meta_mapping_golden.json @@ -603,6 +603,9 @@ { "urn": "urn:li:glossaryTerm:customer_id" }, + { + "urn": "urn:li:glossaryTerm:maturity_beta" + }, { "urn": "urn:li:glossaryTerm:pii" } diff --git a/metadata-ingestion/tests/integration/dbt/sample_dbt_manifest.json b/metadata-ingestion/tests/integration/dbt/sample_dbt_manifest.json index 1c7d693d707c1..94ee749d48e0d 100644 --- a/metadata-ingestion/tests/integration/dbt/sample_dbt_manifest.json +++ b/metadata-ingestion/tests/integration/dbt/sample_dbt_manifest.json @@ -7573,9 +7573,9 @@ "depends_on": { "macros": [], "nodes": [ + "source.sample_dbt.pagila.customer", "source.sample_dbt.pagila.city", "source.sample_dbt.pagila.address", - "source.sample_dbt.pagila.customer", "snapshot.sample_dbt.customer_snapshot" ] }, @@ -7612,15 +7612,15 @@ "sources": [ [ "pagila", - "city" + "customer" ], [ "pagila", - "address" + "city" ], [ "pagila", - "customer" + "address" ] ], "tags": [], @@ -7786,12 +7786,12 @@ "depends_on": { "macros": [], "nodes": [ - "source.sample_dbt.pagila.payment_p2020_01", - "source.sample_dbt.pagila.payment_p2020_06", "source.sample_dbt.pagila.payment_p2020_03", + "source.sample_dbt.pagila.payment_p2020_06", + "source.sample_dbt.pagila.payment_p2020_01", "source.sample_dbt.pagila.payment_p2020_04", - "source.sample_dbt.pagila.payment_p2020_05", - "source.sample_dbt.pagila.payment_p2020_02" + "source.sample_dbt.pagila.payment_p2020_02", + "source.sample_dbt.pagila.payment_p2020_05" ] }, "description": "", @@ -7823,7 +7823,7 @@ "sources": [ [ "pagila", - "payment_p2020_01" + "payment_p2020_03" ], [ "pagila", @@ -7831,7 +7831,7 @@ ], [ "pagila", 
- "payment_p2020_03" + "payment_p2020_01" ], [ "pagila", @@ -7839,11 +7839,11 @@ ], [ "pagila", - "payment_p2020_05" + "payment_p2020_02" ], [ "pagila", - "payment_p2020_02" + "payment_p2020_05" ] ], "tags": [], @@ -7882,6 +7882,7 @@ "description": "description for customer_id from dbt", "meta": { "is_sensitive": true, + "maturity": "beta", "terms": "pii, customer_id" }, "name": "customer_id", diff --git a/metadata-ingestion/tests/integration/dbt/test_dbt.py b/metadata-ingestion/tests/integration/dbt/test_dbt.py index a60f4da234d49..2aece4c23cd61 100644 --- a/metadata-ingestion/tests/integration/dbt/test_dbt.py +++ b/metadata-ingestion/tests/integration/dbt/test_dbt.py @@ -176,6 +176,11 @@ def set_paths( "operation": "add_tag", "config": {"tag": "sensitive"}, }, + "maturity": { + "match": ".*", + "operation": "add_term", + "config": {"term": "maturity_{{ $match }}"}, + }, }, "entities_enabled": { "test_definitions": "NO",