feat(ingest): bigquery - ability to disable partition profiling (data…

…hub-project#4228)
leifker · Mar 2, 2022 · 2a5cf3d · 2a5cf3d
1 parent d52638a
commit 2a5cf3d
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 8 deletions.
diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md
@@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
 Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs.
 
 ## Profiling
-For profiling you have to set a table schema where Great Expectation (the profiling framework we use) can create temporary
-views by setting `profiling.bigquery_temp_table_schema` property.
+Profiling supports normal, partitioned, and sharded tables; however, for performance reasons, we only profile the latest partition of partitioned tables and the latest shard of sharded tables.
+
+If a limit/offset parameter is set, or when profiling a partitioned or sharded table, Great Expectations (the profiling framework we use) needs to create temporary
+views. By default these views are created in the schema where the profiled table is, but you can have all of these
+tables created in a predefined schema by setting the `profiling.bigquery_temp_table_schema` property.
+Temporary tables are removed after profiling.
 
 ```yaml
      profiling:
@@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
 :::note
 
 Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
-
+You can set the partition explicitly with the `profiling.partition_datetime` property if you want (the partition will be applied to all partitioned tables).
 :::
 
 # BigQuery Usage Stats

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -793,11 +793,21 @@ def _generate_single_profile(
             **kwargs,
         }
 
-        if self.config.bigquery_temp_table_schema is not None:
-            bigquery_temp_table = (
-                f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
-            )
-            ge_config["bigquery_temp_table"] = bigquery_temp_table
+        # We have to create temporary tables if offset or limit or custom sql is set on Bigquery
+        if custom_sql or self.config.limit or self.config.offset:
+            if self.config.bigquery_temp_table_schema:
+                bigquery_temp_table = (
+                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
+                )
+                ge_config["bigquery_temp_table"] = bigquery_temp_table
+            else:
+                assert table
+                table_parts = table.split(".")
+                if len(table_parts) == 2:
+                    bigquery_temp_table = (
+                        f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
+                    )
+                    ge_config["bigquery_temp_table"] = bigquery_temp_table
 
         if custom_sql is not None:
             ge_config["query"] = custom_sql

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = True
 
+    partition_profiling_enabled: bool = True
     bigquery_temp_table_schema: Optional[str] = None
     partition_datetime: Optional[datetime.datetime]
 

diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -1168,6 +1168,15 @@ def loop_profiler_requests(
                 schema, table, self.config.profiling.partition_datetime
             )
 
+            if (
+                partition is not None
+                and not self.config.profiling.partition_profiling_enabled
+            ):
+                logger.debug(
+                    f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
+                )
+                continue
+
             self.report.report_entity_profiled(dataset_name)
             yield GEProfilerRequest(
                 pretty_name=dataset_name,