From 25bb3b050e1af9d6a7c1ee93248eeab04ce3ce65 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Wed, 21 Dec 2022 05:29:46 +0100 Subject: [PATCH] feat(ingest/bigquery): add option to enable/disable legacy sharded table support (#6822) Co-authored-by: Harshal Sheth Co-authored-by: John Joyce --- docs/how/updating-datahub.md | 20 +++++++++++++++---- metadata-ingestion/setup.py | 2 +- .../ingestion/source/bigquery_v2/bigquery.py | 2 ++ .../source/bigquery_v2/bigquery_audit.py | 6 ++++-- .../source/bigquery_v2/bigquery_config.py | 5 +++++ .../ingestion/source/bigquery_v2/common.py | 2 -- .../src/datahub/ingestion/source/mode.py | 8 ++++---- 7 files changed, 32 insertions(+), 13 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index b1b8fdc7db74e..eddef42275359 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -5,12 +5,23 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ## Next ### Breaking Changes + +### Potential Downtime + +### Deprecations + +### Other notable Changes + +## 0.9.4 + +### Breaking Changes + - #6243 apache-ranger authorizer is no longer the core part of DataHub GMS, and it is shifted as plugin. Please refer updated documentation [Configuring Authorization with Apache Ranger](./configuring-authorization-with-apache-ranger.md#configuring-your-datahub-deployment) for configuring `apache-ranger-plugin` in DataHub GMS. - #6243 apache-ranger authorizer as plugin is not supported in DataHub Kubernetes deployment. -- #6243 Authentication and Authorization plugins configuration are removed from [application.yml](../../metadata-service/factories/src/main/resources/application.yml). Refer documentation [Migration Of Plugins From application.yml](../plugins.md#migration-of-plugins-from-applicationyml) for migrating any existing custom plugins. +- #6243 Authentication and Authorization plugins configuration are removed from [application.yml](../../metadata-service/factories/src/main/resources/application.yml). Refer documentation [Migration Of Plugins From application.yml](../plugins.md#migration-of-plugins-from-applicationyml) for migrating any existing custom plugins. - `datahub check graph-consistency` command has been removed. It was a beta API that we had considered but decided there are better solutions for this. So removing this. - `graphql_url` option of `powerbi-report-server` source deprecated as the options is not used. -- #6789 biquery-source: sharded table support changes a bit and it will generate different id as before to make sure it does not clash with non-sharded table names. This means if stateful ingestion is enabled then old sharded tables will be recreated with new id and attached tags/glossary_terms/etc needs to be added again. +- #6789 BigQuery ingestion: If `enable_legacy_sharded_table_support` is set to False, sharded table names will be suffixed with \_yyyymmdd to make sure they don't clash with non-sharded tables. This means if stateful ingestion is enabled then old sharded tables will be recreated with a new id and attached tags/glossary terms/etc will need to be added again. _This behavior is not enabled by default yet, but will be enabled by default in a future release._ ### Potential Downtime @@ -25,7 +36,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ### Breaking Changes -- The beta `datahub check graph-consistency` command has been removed. +- The beta `datahub check graph-consistency` command has been removed. ### Potential Downtime @@ -56,7 +67,8 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ## 0.9.1 ### Breaking Changes -- we have promoted `bigqery-beta` to `bigquery`. If you are using `bigquery-beta` then change your recipes to use the type `bigquery` + +- We have promoted `bigquery-beta` to `bigquery`. If you are using `bigquery-beta` then change your recipes to use the type `bigquery`. ### Potential Downtime diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 3c80a41789739..8e987680b85c3 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -361,7 +361,7 @@ def get_long_description(): "types-pkg_resources", "types-six", "types-python-dateutil", - "types-requests", + "types-requests>=2.28.11.6", "types-toml", "types-PyMySQL", "types-PyYAML", diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 5259f99b04d85..6e2b2af350eab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -198,6 +198,8 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): BigqueryTableIdentifier._BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = ( self.config.sharded_table_pattern ) + if self.config.enable_legacy_sharded_table_support: + BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "" set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index c52cd45edbaa5..d209c8a783d31 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -8,7 +8,6 @@ from dateutil import parser from datahub.emitter.mce_builder import make_dataset_urn -from datahub.ingestion.source.bigquery_v2.common import BQ_SHARDED_TABLE_SUFFIX from datahub.utilities.parsing_util import ( get_first_missing_key, get_first_missing_key_any, @@ -81,6 +80,7 @@ class BigqueryTableIdentifier: invalid_chars: ClassVar[Set[str]] = {"$", "@"} _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[str] = "((.+)[_$])?(\\d{8})$" _BIGQUERY_WILDCARD_REGEX: ClassVar[str] = "((_(\\d+)?)\\*$)|\\*$" + _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd" @staticmethod def get_table_and_shard(table_name: str) -> Tuple[str, Optional[str]]: @@ -134,7 +134,9 @@ def get_table_name(self) -> str: f"{self.project_id}.{self.dataset}.{self.get_table_display_name()}" ) if self.is_sharded_table(): - table_name = f"{table_name}{BQ_SHARDED_TABLE_SUFFIX}" + table_name = ( + f"{table_name}{BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX}" + ) return table_name def is_sharded_table(self) -> bool: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index b23799b886fde..6319a19bb895b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -103,6 +103,11 @@ class BigQueryV2Config(BigQueryConfig, LineageConfig): description="Convert urns to lowercase.", ) + enable_legacy_sharded_table_support: bool = Field( + default=True, + description="Use the legacy sharded table urn suffix added.", + ) + @root_validator(pre=False) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py index 97f28b5af60c0..4ff509858b87d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/common.py @@ -11,8 +11,6 @@ BQ_EXTERNAL_TABLE_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m5!1m4!4m3!1s{project}!2s{dataset}!3s{table}" BQ_EXTERNAL_DATASET_URL_TEMPLATE = "https://console.cloud.google.com/bigquery?project={project}&ws=!1m4!1m3!3m2!1s{project}!2s{dataset}" -BQ_SHARDED_TABLE_SUFFIX = "_yyyymmdd" - def _make_gcp_logging_client( project_id: Optional[str] = None, extra_client_options: Dict[str, Any] = {} diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index d3b9c816651ff..436dfcc0b6998 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -68,9 +68,9 @@ class ModeConfig(DatasetLineageProviderConfigBase): connect_uri: str = Field( default="https://app.mode.com", description="Mode host URL." ) - token: Optional[str] = Field(default=None, description="Mode user token.") - password: Optional[pydantic.SecretStr] = Field( - default=None, description="Mode password for authentication." + token: str = Field(description="Mode user token.") + password: pydantic.SecretStr = Field( + description="Mode password for authentication." ) workspace: Optional[str] = Field(default=None, description="") default_schema: str = Field( @@ -172,7 +172,7 @@ def __init__(self, ctx: PipelineContext, config: ModeConfig): self.session = requests.session() self.session.auth = HTTPBasicAuth( self.config.token, - self.config.password.get_secret_value() if self.config.password else None, + self.config.password.get_secret_value(), ) self.session.headers.update( {