diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml b/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml
index 125ce28952a52..c75f372c49e50 100644
--- a/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml
+++ b/metadata-ingestion/docs/sources/powerbi/powerbi_recipe.yml
@@ -23,10 +23,18 @@ source:
     extract_endorsements_to_tags: false
     # dataset_type_mapping is fixed mapping of Power BI datasources type to equivalent Datahub "data platform" dataset
    dataset_type_mapping:
-      PostgreSql: postgres
-      Oracle: oracle
-      Sql: mssql
-      GoogleBigQuery: bigquery
+      PostgreSql:
+        platform_instance: operational_instance
+        env: DEV
+      Oracle:
+        platform_instance: high_performance_production_unit
+        env: PROD
+      Sql:
+        platform_instance: reporting-db
+        env: QA
+      GoogleBigQuery:
+        platform_instance: sn-2
+        env: STAGE
 
 sink:
   # sink configs
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index 84f8176ab2bfa..0a3c8bd5f99c8 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -1,5 +1,6 @@
 import logging
 from dataclasses import dataclass, field as dataclass_field
+from enum import Enum
 from typing import Dict, List, Optional, Union
 
 import pydantic
@@ -7,7 +8,7 @@
 from pydantic.class_validators import root_validator
 
 import datahub.emitter.mce_builder as builder
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
 from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
@@ -105,6 +106,47 @@ class Constant:
     DATASET_WEB_URL = "datasetWebUrl"
 
 
+@dataclass
+class DataPlatformPair:
+    datahub_data_platform_name: str
+    powerbi_data_platform_name: str
+
+
+@dataclass
+class DataPlatformTable:
+    name: str
+    full_name: str
+    data_platform_pair: DataPlatformPair
+
+
+class SupportedDataPlatform(Enum):
+    POSTGRES_SQL = DataPlatformPair(
+        powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres"
+    )
+
+    ORACLE = DataPlatformPair(
+        powerbi_data_platform_name="Oracle", datahub_data_platform_name="oracle"
+    )
+
+    SNOWFLAKE = DataPlatformPair(
+        powerbi_data_platform_name="Snowflake", datahub_data_platform_name="snowflake"
+    )
+
+    MS_SQL = DataPlatformPair(
+        powerbi_data_platform_name="Sql", datahub_data_platform_name="mssql"
+    )
+
+    GOOGLE_BIGQUERY = DataPlatformPair(
+        powerbi_data_platform_name="GoogleBigQuery",
+        datahub_data_platform_name="bigquery",
+    )
+
+    AMAZON_REDSHIFT = DataPlatformPair(
+        powerbi_data_platform_name="AmazonRedshift",
+        datahub_data_platform_name="redshift",
+    )
+
+
 @dataclass
 class PowerBiDashboardSourceReport(StaleEntityRemovalSourceReport):
     dashboards_scanned: int = 0
@@ -129,11 +171,21 @@ def report_number_of_workspaces(self, number_of_workspaces: int) -> None:
         self.number_of_workspaces = number_of_workspaces
 
 
-@dataclass
-class PlatformDetail:
+def default_for_dataset_type_mapping() -> Dict[str, str]:
+    dict_: dict = {}
+    for item in SupportedDataPlatform:
+        dict_[
+            item.value.powerbi_data_platform_name
+        ] = item.value.datahub_data_platform_name
+
+    return dict_
+
+
+class PlatformDetail(ConfigModel):
     platform_instance: Optional[str] = pydantic.Field(
         default=None,
-        description="DataHub platform instance name. It should be same as you have used in ingestion receipe of DataHub platform ingestion source of particular platform",
+        description="DataHub platform instance name. It should be the same as the one used in the ingestion "
+        "recipe of the corresponding platform's DataHub ingestion source",
     )
     env: str = pydantic.Field(
         default=DEFAULT_ENV,
@@ -171,7 +223,10 @@ class PowerBiDashboardSourceConfig(
     dataset_type_mapping: Union[
         Dict[str, str], Dict[str, PlatformDetail]
     ] = pydantic.Field(
-        description="Mapping of PowerBI datasource type to DataHub supported data-sources. See Quickstart Recipe for mapping"
+        default_factory=default_for_dataset_type_mapping,
+        description="Mapping of PowerBI datasource type to DataHub supported data-sources. "
+        "You can configure a platform instance for dataset lineage. "
+        "See Quickstart Recipe for mapping",
     )
     # Azure app client identifier
     client_id: str = pydantic.Field(description="Azure app client identifier")
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
index 82968708a9505..53f46de535128 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py
@@ -6,7 +6,11 @@
 
 from lark import Tree
 
-from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport
+from datahub.ingestion.source.powerbi.config import (
+    DataPlatformPair,
+    PowerBiDashboardSourceReport,
+    SupportedDataPlatform,
+)
 from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function
 from datahub.ingestion.source.powerbi.m_query.data_classes import (
     TRACE_POWERBI_MQUERY_PARSER,
@@ -18,12 +22,6 @@
 logger = logging.getLogger(__name__)
 
 
-@dataclass
-class DataPlatformPair:
-    datahub_data_platform_name: str
-    powerbi_data_platform_name: str
-
-
 @dataclass
 class DataPlatformTable:
     name: str
@@ -31,34 +29,6 @@ class DataPlatformTable:
     data_platform_pair: DataPlatformPair
 
 
-class SupportedDataPlatform(Enum):
-    POSTGRES_SQL = DataPlatformPair(
-        powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres"
-    )
-
-    ORACLE = DataPlatformPair(
-        powerbi_data_platform_name="Oracle", datahub_data_platform_name="oracle"
-    )
-
-    SNOWFLAKE = DataPlatformPair(
-        powerbi_data_platform_name="Snowflake", datahub_data_platform_name="snowflake"
-    )
-
-    MS_SQL = DataPlatformPair(
-        powerbi_data_platform_name="Sql", datahub_data_platform_name="mssql"
-    )
-
-    GOOGLE_BIGQUERY = DataPlatformPair(
-        powerbi_data_platform_name="GoogleBigQuery",
-        datahub_data_platform_name="bigquery",
-    )
-
-    AMAZON_REDSHIFT = DataPlatformPair(
-        powerbi_data_platform_name="AmazonRedshift",
-        datahub_data_platform_name="redshift",
-    )
-
-
 class AbstractTableFullNameCreator(ABC):
     @abstractmethod
     def get_full_table_names(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
index 6bf1b72909bd5..b33adad565207 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py
@@ -947,6 +947,10 @@ def validate_dataset_type_mapping(self):
             if key not in powerbi_data_platforms:
                 raise ValueError(f"PowerBI DataPlatform {key} is not supported")
 
+        logger.debug(
+            f"Dataset lineage will be ingested for the configured data platforms: {self.source_config.dataset_type_mapping}"
+        )
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
index 29f949fe3799e..cea7c471168cc 100644
--- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
+++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py
@@ -1,12 +1,17 @@
 import logging
 import sys
-from typing import Any, Dict
+from typing import Any, Dict, cast
 from unittest import mock
 
 import pytest
 from freezegun import freeze_time
 
 from datahub.ingestion.run.pipeline import Pipeline
+from datahub.ingestion.source.powerbi.config import (
+    PowerBiDashboardSourceConfig,
+    SupportedDataPlatform,
+)
+from datahub.ingestion.source.powerbi.powerbi import PowerBiDashboardSource
 from tests.test_helpers import mce_helpers
 
 FROZEN_TIME = "2022-02-03 07:00:00"
@@ -978,3 +983,50 @@ def test_workspace_container(
         output_path=tmp_path / "powerbi_container_mces.json",
         golden_path=f"{test_resources_dir}/{mce_out_file}",
     )
+
+
+@freeze_time(FROZEN_TIME)
+@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca)
+@pytest.mark.integration
+def test_dataset_type_mapping_should_set_to_all(
+    mock_msal, pytestconfig, tmp_path, mock_time, requests_mock
+):
+    """
+    Here we don't need to run the pipeline; we only need to verify that dataset_type_mapping is set to the default data platforms.
+    """
+    register_mock_api(request_mock=requests_mock)
+
+    new_config: dict = {**default_source_config()}
+
+    del new_config["dataset_type_mapping"]
+
+    pipeline = Pipeline.create(
+        {
+            "run_id": "powerbi-test",
+            "source": {
+                "type": "powerbi",
+                "config": {
+                    **new_config,
+                },
+            },
+            "sink": {
+                "type": "file",
+                "config": {
+                    "filename": f"{tmp_path}/powerbi_lower_case_urn_mces.json",
+                },
+            },
+        }
+    )
+    source_config: PowerBiDashboardSourceConfig = cast(
+        PowerBiDashboardSource, pipeline.source
+    ).source_config
+    assert source_config.dataset_type_mapping is not None
+
+    # Generate the default dataset_type_mapping and compare it with source_config.dataset_type_mapping
+    default_dataset_type_mapping: dict = {}
+    for item in SupportedDataPlatform:
+        default_dataset_type_mapping[
+            item.value.powerbi_data_platform_name
+        ] = item.value.datahub_data_platform_name
+
+    assert default_dataset_type_mapping == source_config.dataset_type_mapping
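
The core of this change is that dataset_type_mapping now accepts either plain strings or PlatformDetail objects, with pydantic resolving the Union by trying Dict[str, str] first and falling back to Dict[str, PlatformDetail]. Below is a minimal, self-contained sketch of that parsing behavior; DemoConfig and the instance names are illustrative stand-ins, not the shipped PowerBiDashboardSourceConfig.

from typing import Dict, Optional, Union

import pydantic


class PlatformDetail(pydantic.BaseModel):
    # Mirrors the fields of the PlatformDetail ConfigModel added in config.py.
    platform_instance: Optional[str] = None
    env: str = "PROD"


class DemoConfig(pydantic.BaseModel):
    # Same Union shape as PowerBiDashboardSourceConfig.dataset_type_mapping.
    dataset_type_mapping: Union[Dict[str, str], Dict[str, PlatformDetail]]


# Old, simple form: PowerBI platform name -> DataHub platform name.
simple = DemoConfig(dataset_type_mapping={"PostgreSql": "postgres"})
assert simple.dataset_type_mapping["PostgreSql"] == "postgres"

# New, detailed form: a dict value fails str validation, so pydantic parses
# it as PlatformDetail, attaching platform_instance/env so lineage URNs match
# the ones emitted by the upstream platform's own ingestion recipe.
detailed = DemoConfig(
    dataset_type_mapping={
        "PostgreSql": {"platform_instance": "operational_instance", "env": "DEV"}
    }
)
assert isinstance(detailed.dataset_type_mapping["PostgreSql"], PlatformDetail)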