From 2a5cf3dd07e6c085d668a414548bc4e520e69fb6 Mon Sep 17 00:00:00 2001
From: Tamas Nemeth <treff7es@gmail.com>
Date: Wed, 2 Mar 2022 07:29:48 +0100
Subject: [PATCH] feat(ingest): bigquery - ability to disable partition
 profiling (#4228)

---
 metadata-ingestion/source_docs/bigquery.md    | 10 +++++++---
 .../ingestion/source/ge_data_profiler.py      | 20 ++++++++++++++-----
 .../ingestion/source/ge_profiling_config.py   |  1 +
 .../ingestion/source/sql/sql_common.py        |  9 +++++++++
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/metadata-ingestion/source_docs/bigquery.md b/metadata-ingestion/source_docs/bigquery.md
index fab1a527e64ad..9a180e91f944e 100644
--- a/metadata-ingestion/source_docs/bigquery.md
+++ b/metadata-ingestion/source_docs/bigquery.md
@@ -156,8 +156,12 @@ Note: the bigquery_audit_metadata_datasets parameter receives a list of datasets
 Note: Since bigquery source also supports dataset level lineage, the auth client will require additional permissions to be able to access the google audit logs. Refer the permissions section in bigquery-usage section below which also accesses the audit logs.
 
 ## Profiling
-For profiling you have to set a table schema where Great Expectation (the profiling framework we use) can create temporary
-views by setting `profiling.bigquery_temp_table_schema` property.
+Profiling supports normal, partitioned, and sharded tables, but for performance reasons we only profile the latest partition of partitioned tables and the latest shard of sharded tables.
+
+If a limit/offset parameter is set, or when profiling a partitioned or sharded table, Great Expectations (the profiling framework we use) needs to create temporary
+views. By default these views are created in the schema of the profiled table, but you can have all of them created
+in a predefined schema instead by setting the `profiling.bigquery_temp_table_schema` property.
+Temporary tables are removed after profiling.
 
 ```yaml
      profiling:
@@ -168,7 +172,7 @@ views by setting `profiling.bigquery_temp_table_schema` property.
 :::note
 
 Due to performance reasons, we only profile the latest partition for Partitioned tables and the latest shard for sharded tables.
-
+You can set the partition explicitly with the `profiling.partition_datetime` property if you want (the partition will be applied to all partitioned tables).
 :::
 
 # BigQuery Usage Stats
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
index 5522025832dc0..cac5e7dfe9f30 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -793,11 +793,21 @@ def _generate_single_profile(
             **kwargs,
         }
 
-        if self.config.bigquery_temp_table_schema is not None:
-            bigquery_temp_table = (
-                f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
-            )
-            ge_config["bigquery_temp_table"] = bigquery_temp_table
+        # We have to create temporary tables if offset or limit or custom sql is set on Bigquery
+        if custom_sql or self.config.limit or self.config.offset:
+            if self.config.bigquery_temp_table_schema:
+                bigquery_temp_table = (
+                    f"{self.config.bigquery_temp_table_schema}.ge-temp-{uuid.uuid4()}"
+                )
+                ge_config["bigquery_temp_table"] = bigquery_temp_table
+            else:
+                assert table
+                table_parts = table.split(".")
+                if len(table_parts) == 2:
+                    bigquery_temp_table = (
+                        f"{schema}.{table_parts[0]}.ge-temp-{uuid.uuid4()}"
+                    )
+                    ge_config["bigquery_temp_table"] = bigquery_temp_table
 
         if custom_sql is not None:
             ge_config["query"] = custom_sql
diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
index 974d622090da3..eb843587020ac 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py
@@ -43,6 +43,7 @@ class GEProfilingConfig(ConfigModel):
     # Hidden option - used for debugging purposes.
     catch_exceptions: bool = True
 
+    partition_profiling_enabled: bool = True
     bigquery_temp_table_schema: Optional[str] = None
     partition_datetime: Optional[datetime.datetime]
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
index c9f9487a171d3..5d10181ea53cb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -1168,6 +1168,15 @@ def loop_profiler_requests(
                 schema, table, self.config.profiling.partition_datetime
             )
 
+            if (
+                partition is not None
+                and not self.config.profiling.partition_profiling_enabled
+            ):
+                logger.debug(
+                    f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled"
+                )
+                continue
+
             self.report.report_entity_profiled(dataset_name)
             yield GEProfilerRequest(
                 pretty_name=dataset_name,