From 7da23e4592e501e6d159d30260e0fdbd90e58139 Mon Sep 17 00:00:00 2001 From: Kirill Popov Date: Mon, 3 Jul 2023 21:20:40 +0300 Subject: [PATCH] feat(ingest): Add metabase name to platform instance mapping --- .../docs/sources/metabase/metabase.md | 8 ++++++ .../docs/sources/metabase/metabase.yml | 4 ++- .../src/datahub/ingestion/source/metabase.py | 25 ++++++++++++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/docs/sources/metabase/metabase.md b/metadata-ingestion/docs/sources/metabase/metabase.md index 74b7b73bbca0c3..a76786f7e5853a 100644 --- a/metadata-ingestion/docs/sources/metabase/metabase.md +++ b/metadata-ingestion/docs/sources/metabase/metabase.md @@ -9,6 +9,14 @@ the underlying datasets in the `glue` platform, the following snippet can be use DataHub will try to determine database name from Metabase [api/database](https://www.metabase.com/docs/latest/api-documentation.html#database) payload. However, the name can be overridden from `database_alias_map` for a given database connected to Metabase. +If several platform instances with the same platform (e.g. from several distinct clickhouse clusters) are present in DataHub, +the mapping between database id in Metabase and platform instance in DataHub may be configured with the following map: +```yml + database_id_to_instance_map: + "42": platform_instance_in_datahub +``` +The key in this map must be a string, not an integer, although the Metabase API provides `id` as a number. +If `database_id_to_instance_map` is not specified, `platform_instance_map` is used for platform instance mapping. If none of the above are specified, platform instance is not used when constructing `urn` when searching for dataset relations. 
## Compatibility Metabase version [v0.41.2](https://www.metabase.com/start/oss/) diff --git a/metadata-ingestion/docs/sources/metabase/metabase.yml b/metadata-ingestion/docs/sources/metabase/metabase.yml index e0ef6b4ba72fbc..cc2aed9f8bce05 100644 --- a/metadata-ingestion/docs/sources/metabase/metabase.yml +++ b/metadata-ingestion/docs/sources/metabase/metabase.yml @@ -15,6 +15,8 @@ source: # Optional mapping of platform types to instance ids platform_instance_map: # optional postgres: test_postgres # optional + database_id_to_instance_map: # optional + "42": platform_instance_in_datahub # optional sink: - # sink configs \ No newline at end of file + # sink configs diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index 0a6d8c605688a2..254a6f93c8676a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -60,6 +60,10 @@ class MetabaseConfig(DatasetLineageProviderConfigBase): default=None, description="Custom mappings between metabase database engines and DataHub platforms", ) + database_id_to_instance_map: Optional[Dict[str, str]] = Field( + default=None, + description="Custom mappings between metabase database id and DataHub platform instance", + ) default_schema: str = Field( default="public", description="Default schema name to use when schema is not provided in an SQL query", @@ -272,6 +276,16 @@ def _get_ownership(self, creator_id: int) -> Optional[OwnershipClass]: user_info_response.raise_for_status() user_details = user_info_response.json() except HTTPError as http_error: + if ( + http_error.response is not None + and http_error.response.status_code == 404 + ): + self.report.report_warning( + key=f"metabase-user-{creator_id}", + reason=f"User {creator_id} is blocked in Metabase or missing", + ) + return None + # For cases when the error is not 404 but something else 
self.report.report_failure( key=f"metabase-user-{creator_id}", reason=f"Unable to retrieve User info. " f"Reason: {str(http_error)}", @@ -564,10 +578,19 @@ def get_datasource_from_id(self, datasource_id): reason=f"Platform was not found in DataHub. Using {platform} name as is", ) + # For cases when metabase has several platform instances (e.g. several individual ClickHouse clusters) + datasource_id_in_metabase = dataset_json.get("id") + platform_instance = ( + self.config.database_id_to_instance_map.get(str(datasource_id_in_metabase)) + if datasource_id_in_metabase and self.config.database_id_to_instance_map + else None + ) + + # If Metabase datasource ID is not mapped to platform instance, fall back to platform mapping # Set platform_instance if configuration provides a mapping from platform name to instance platform_instance = ( self.config.platform_instance_map.get(platform) - if self.config.platform_instance_map + if self.config.platform_instance_map and platform_instance is None else None )