From 2a5cf3dd07e6c085d668a414548bc4e520e69fb6 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Wed, 2 Mar 2022 07:29:48 +0100 Subject: [PATCH] feat(ingest): bigquery - ability to disable partition profiling (#4228) --- metadata-ingestion/source_docs/bigquery.md | 10 +++++++--- .../ingestion/source/ge_data_profiler.py | 20 ++++++++++++++----- .../ingestion/source/ge_profiling_config.py | 1 + .../ingestion/source/sql/sql_common.py | 9 +++++++++ 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md index fab1a527e64ad..9a180e91f944e 100644 --- a/metadata-ingestion/source_docs/bigquery.md +++ b/metadata-ingestion/source_docs/bigquery.md @@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs. ## Profiling -For profiling you have to set a table schema where Great Expectation (the profiling framework we use) can create temporary -views by setting `profiling.bigquery_temp_table_schema` property. +Profiling supports normal, partitioned, and sharded tables, but for performance reasons we only profile the latest partition of partitioned tables and the latest shard of sharded tables. + +If a limit/offset parameter is set, or when profiling a partitioned or sharded table, Great Expectations (the profiling framework we use) needs to create temporary +views. By default these views are created in the same schema as the profiled table, but you can have all of these +temporary tables created in a predefined schema by setting the `profiling.bigquery_temp_table_schema` property. +Temporary tables are removed after profiling. 
```yaml profiling: @@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property. :::note Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables. - +You can set the partition explicitly with the `profiling.partition_datetime` property if you want (the partition will be applied to all partitioned tables). ::: # BigQuery Usage Stats diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 5522025832dc0..cac5e7dfe9f30 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -793,11 +793,21 @@ def _generate_single_profile( **kwargs, } - if self.config.bigquery_temp_table_schema is not None: - bigquery_temp_table = ( - f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}" - ) - ge_config["bigquery_temp_table"] = bigquery_temp_table + # We have to create temporary tables if offset or limit or custom sql is set on Bigquery + if custom_sql or self.config.limit or self.config.offset: + if self.config.bigquery_temp_table_schema: + bigquery_temp_table = ( + f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}" + ) + ge_config["bigquery_temp_table"] = bigquery_temp_table + else: + assert table + table_parts = table.split(".") + if len(table_parts) == 2: + bigquery_temp_table = ( + f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}" + ) + ge_config["bigquery_temp_table"] = bigquery_temp_table if custom_sql is not None: ge_config["query"] = custom_sql diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 974d622090da3..eb843587020ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel): # Hidden option - used for debugging purposes. catch_exceptions: bool = True + partition_profiling_enabled: bool = True bigquery_temp_table_schema: Optional[str] = None partition_datetime: Optional[datetime.datetime] diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index c9f9487a171d3..5d10181ea53cb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -1168,6 +1168,15 @@ def loop_profiler_requests( schema, table, self.config.profiling.partition_datetime ) + if ( + partition is not None + and not self.config.profiling.partition_profiling_enabled + ): + logger.debug( + f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" + ) + continue + self.report.report_entity_profiled(dataset_name) yield GEProfilerRequest( pretty_name=dataset_name,