From f8ff3f140de5dc9f1104909cd133d7839800a510 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Mon, 22 May 2023 17:07:43 -0700 Subject: [PATCH 1/6] refactor(ingest): Call source_helpers via new WorkUnitProcessors in base Source --- .../src/datahub/ingestion/api/source.py | 53 ++++++++++++++++- .../datahub/ingestion/api/source_helpers.py | 8 +-- .../src/datahub/ingestion/source/aws/glue.py | 25 +++----- .../ingestion/source/bigquery_v2/bigquery.py | 51 +++++++--------- .../ingestion/source/dbt/dbt_common.py | 45 +++++++------- .../src/datahub/ingestion/source/file.py | 8 ++- .../ingestion/source/gcs/gcs_source.py | 38 ++++-------- .../ingestion/source/iceberg/iceberg.py | 32 +++------- .../ingestion/source/identity/azure_ad.py | 38 +++++------- .../datahub/ingestion/source/identity/okta.py | 34 +++-------- .../src/datahub/ingestion/source/kafka.py | 32 +++------- .../datahub/ingestion/source/kafka_connect.py | 33 +++-------- .../src/datahub/ingestion/source/ldap.py | 31 +++------- .../ingestion/source/looker/looker_source.py | 30 +++------- .../ingestion/source/looker/lookml_source.py | 31 +++------- .../ingestion/source/powerbi/powerbi.py | 58 +++++++------------ .../src/datahub/ingestion/source/pulsar.py | 31 +++------- .../ingestion/source/redshift/redshift.py | 36 ++++-------- .../src/datahub/ingestion/source/s3/source.py | 37 +++--------- .../ingestion/source/schema/json_schema.py | 30 +++------- .../source/snowflake/snowflake_v2.py | 37 ++++-------- .../ingestion/source/sql/sql_common.py | 35 ++++------- .../state/stale_entity_removal_handler.py | 18 ++++++ .../src/datahub/ingestion/source/superset.py | 33 +++-------- .../src/datahub/ingestion/source/tableau.py | 35 ++++------- .../datahub/ingestion/source/unity/source.py | 33 +++-------- .../integration/azure_ad/test_azure_ad.py | 16 +---- .../tests/integration/iceberg/test_iceberg.py | 15 +---- .../tests/integration/integration_helpers.py | 19 ++++++ .../kafka-connect/test_kafka_connect.py | 15 +---- .../integration/kafka/test_kafka_state.py | 16 +---- .../integration/ldap/test_ldap_stateful.py | 14 +---- .../tests/integration/looker/test_looker.py | 12 +--- .../tests/integration/lookml/test_lookml.py | 14 +---- .../tests/integration/okta/test_okta.py | 15 +---- .../integration/superset/test_superset.py | 15 +---- .../tableau/test_tableau_ingest.py | 14 +---- .../tests/unit/test_glue_source.py | 12 +--- metadata-ingestion/tests/unit/test_source.py | 2 +- 39 files changed, 359 insertions(+), 692 deletions(-) create mode 100644 metadata-ingestion/tests/integration/integration_helpers.py diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 7c19b4a3b1406..06dce219da34d 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -3,7 +3,20 @@ from collections import defaultdict from dataclasses import dataclass, field from enum import Enum -from typing import Dict, Generic, Iterable, Optional, Set, Type, TypeVar, Union, cast +from functools import partial +from typing import ( + Callable, + Dict, + Generic, + Iterable, + Optional, + Sequence, + Set, + Type, + TypeVar, + Union, + cast, +) from pydantic import BaseModel @@ -12,6 +25,11 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit from datahub.ingestion.api.report import Report +from datahub.ingestion.api.source_helpers import ( + 
+    auto_materialize_referenced_tags,
+    auto_status_aspect,
+    auto_workunit_reporter,
+)
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
 from datahub.utilities.lossy_collections import LossyDict, LossyList
@@ -118,6 +136,9 @@ class TestConnectionReport(Report):
 WorkUnitType = TypeVar("WorkUnitType", bound=WorkUnit)
 ExtractorConfig = TypeVar("ExtractorConfig", bound=ConfigModel)
 
+WorkUnitProcessor = Callable[[Iterable[WorkUnitType]], Iterable[WorkUnitType]]
+MetadataWorkUnitProcessor = WorkUnitProcessor[MetadataWorkUnit]
+
 
 class Extractor(Generic[WorkUnitType, ExtractorConfig], Closeable, metaclass=ABCMeta):
     ctx: PipelineContext
@@ -155,9 +176,35 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
         # can't make this method abstract.
         raise NotImplementedError('sources must implement "create"')
 
-    @abstractmethod
+    def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]:
+        """A list of functions that transform the workunits produced by this source.
+        Run in order: the first processor in the list is applied first. Be careful with ordering when overriding.
+        """
+        return [
+            auto_status_aspect,
+            auto_materialize_referenced_tags,
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
+
+    @staticmethod
+    def _apply_workunit_processors(
+        workunit_processors: Sequence[Optional[MetadataWorkUnitProcessor]],
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        for processor in workunit_processors:
+            if processor is not None:
+                stream = processor(stream)
+        return stream
+
     def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        pass
+        return self._apply_workunit_processors(
+            self.get_workunit_processors(), self.get_workunits_internal()
+        )
+
+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        raise NotImplementedError(
+            "get_workunits_internal must be implemented if get_workunits is not overridden."
+ ) @abstractmethod def get_report(self) -> SourceReport: diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 6b311cb8ade64..6e27f1f95399c 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -1,7 +1,6 @@ from typing import Callable, Iterable, Optional, Set, TypeVar, Union from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.common import WorkUnit from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -104,7 +103,7 @@ def auto_stale_entity_removal( yield from stale_entity_removal_handler.gen_removed_entity_workunits() -T = TypeVar("T", bound=WorkUnit) +T = TypeVar("T", bound=MetadataWorkUnit) def auto_workunit_reporter(report: SourceReport, stream: Iterable[T]) -> Iterable[T]: @@ -119,14 +118,9 @@ def auto_workunit_reporter(report: SourceReport, stream: Iterable[T]) -> Iterabl def auto_materialize_referenced_tags( stream: Iterable[MetadataWorkUnit], - active: bool = True, ) -> Iterable[MetadataWorkUnit]: """For all references to tags, emit a tag key aspect to ensure that the tag exists in our backend.""" - if not active: - yield from stream - return - referenced_tags = set() tags_with_aspects = set() diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 83273516ea819..db0b51a10fce7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -10,6 +10,7 @@ List, Mapping, Optional, + Sequence, Set, Tuple, Union, @@ -48,6 +49,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.source_helpers import ( auto_stale_entity_removal, auto_status_aspect, @@ -273,15 +275,6 @@ def __init__(self, config: GlueSourceConfig, ctx: PipelineContext): self.extract_transforms = config.extract_transforms self.env = config.env - # Create and register the stateful ingestion use-case handlers. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.source_config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - def get_glue_arn( self, account_id: str, database: str, table: Optional[str] = None ) -> str: @@ -919,13 +912,13 @@ def _get_domain_wu( domain_urn=domain_urn, ) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.source_config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: database_seen = set() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 18b02dabd5ccf..243e5a089fc35 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -5,7 +5,18 @@ import traceback from collections import defaultdict from datetime import datetime, timedelta, timezone -from typing import Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast +from typing import ( + Dict, + Iterable, + List, + Optional, + Sequence, + Set, + Tuple, + Type, + Union, + cast, +) from google.cloud import bigquery from google.cloud.bigquery.table import TableListItem @@ -30,16 +41,11 @@ ) from datahub.ingestion.api.source import ( CapabilityReport, + MetadataWorkUnitProcessor, SourceCapability, TestableSource, TestConnectionReport, ) -from datahub.ingestion.api.source_helpers import ( - auto_materialize_referenced_tags, - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigqueryTableIdentifier, @@ -80,9 +86,6 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantRunSkipHandler, ) -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -228,15 +231,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.lineage_extractor = BigqueryLineageExtractor(config, self.report) self.usage_extractor = BigQueryUsageExtractor(config, self.report) - # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - self.domain_registry: Optional[DomainRegistry] = None if self.config.domain: self.domain_registry = DomainRegistry( @@ -491,6 +485,14 @@ def gen_dataset_containers( tags=tags_joined, ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: conn: bigquery.Client = get_bigquery_client(self.config) self.add_config_to_report() @@ -514,17 +516,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage(project.id, "Lineage Extraction") yield from self.generate_lineage(project.id) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_materialize_referenced_tags( - auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, - auto_status_aspect(self.get_workunits_internal()), - ), - ) - ) - def _should_ingest_usage(self) -> bool: if not self.config.include_usage_statistics: return False diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 5dbe3ba4e67de..26781e8c0a431 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -5,7 +5,18 @@ from dataclasses import dataclass, field from datetime import datetime from enum import auto -from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Tuple, Union +from typing import ( + Any, + Callable, + ClassVar, + Dict, + Iterable, + List, + Optional, + Sequence, + Tuple, + Union, +) import pydantic from pydantic import root_validator, validator @@ -31,12 +42,7 @@ platform_name, support_status, ) -from datahub.ingestion.api.source_helpers import ( - auto_materialize_referenced_tags, - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.sql.sql_types import ( @@ -50,7 +56,6 @@ resolve_trino_modified_type, resolve_vertica_modified_type, ) -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -688,12 +693,8 @@ def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str) self.config.owner_extraction_pattern ) # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, + self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( + self, self.config, ctx ) def create_test_entity_mcps( @@ -878,15 +879,13 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: # return dbt nodes + global custom properties raise NotImplementedError() - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_materialize_referenced_tags( - auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.write_semantics == "PATCH" and not self.ctx.graph: diff --git a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index f37f5901e1fc9..2829bd5cd7bdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -6,8 +6,9 @@ from collections import defaultdict from dataclasses import dataclass, field from enum import auto +from functools import partial from io import BufferedReader -from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Union +from typing import Any, Dict, Iterable, Iterator, Optional, Sequence, Tuple, Union from urllib import parse import ijson @@ -28,6 +29,7 @@ ) from datahub.ingestion.api.source import ( CapabilityReport, + MetadataWorkUnitProcessor, SourceReport, TestableSource, TestConnectionReport, @@ -205,8 +207,8 @@ def get_filenames(self) -> Iterable[str]: self.report.total_num_files = 1 return [str(self.config.path)] - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_workunit_reporter(self.report, self.get_workunits_internal()) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [partial(auto_workunit_reporter, self.report)] def get_workunits_internal( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py b/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py index e0cf0e70244e6..99adc9d38e7ad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional +from typing import Dict, Iterable, List, Optional, Sequence from urllib.parse import unquote from pydantic import Field, SecretStr, validator @@ -14,13 +14,7 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceCapability -from datahub.ingestion.api.source_helpers import ( - auto_materialize_referenced_tags, - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin @@ -29,7 +23,6 @@ 
from datahub.ingestion.source.s3.config import DataLakeSourceConfig from datahub.ingestion.source.s3.report import DataLakeSourceReport from datahub.ingestion.source.s3.source import S3Source -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StatefulStaleMetadataRemovalConfig, @@ -123,13 +116,6 @@ def __init__(self, config: GCSSourceConfig, ctx: PipelineContext): self.config = config self.report = GCSSourceReport() self.s3_source = self.create_equivalent_s3_source(ctx) - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) @classmethod def create(cls, config_dict, ctx): @@ -184,16 +170,16 @@ def s3_source_overrides(self, source: S3Source) -> S3Source: return source - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_materialize_referenced_tags( - auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, - auto_status_aspect(self.s3_source.get_workunits_internal()), - ), - ) - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + return self.s3_source.get_workunits_internal() def get_report(self): return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index ee5f481dac4f9..eaede94774679 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -1,7 +1,7 @@ import json import logging import uuid -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple from iceberg.api import types as IcebergTypes from iceberg.api.table import Table @@ -27,12 +27,7 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceReport -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.extractor import schema_util from datahub.ingestion.source.iceberg.iceberg_common import ( @@ -40,7 +35,6 @@ IcebergSourceReport, ) from datahub.ingestion.source.iceberg.iceberg_profiler import IcebergProfiler -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -120,26 +114,18 @@ def __init__(self, config: IcebergSourceConfig, ctx: PipelineContext) -> None: self.config: IcebergSourceConfig = config self.iceberg_client: FilesystemTables = config.filesystem_tables - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - @classmethod def create(cls, config_dict: Dict, ctx: PipelineContext) -> 
"IcebergSource": config = IcebergSourceConfig.parse_obj(config_dict) return cls(config, ctx) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: for dataset_path, dataset_name in self.config.get_paths(): # Tuple[str, str] diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index e1d78dc25c304..fb0acc0a0e270 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -4,7 +4,7 @@ import urllib from collections import defaultdict from dataclasses import dataclass, field -from typing import Any, Dict, Generator, Iterable, List, Optional +from typing import Any, Dict, Generator, Iterable, List, Optional, Sequence import click import requests @@ -22,16 +22,12 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceCapability, SourceReport -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, +from datahub.ingestion.api.source import ( + MetadataWorkUnitProcessor, + SourceCapability, + SourceReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -283,14 +279,6 @@ def __init__(self, config: AzureADConfig, ctx: PipelineContext): self.token = self.get_token() self.selected_azure_ad_groups: list = [] self.azure_ad_groups_users: list = [] - # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=ctx.pipeline_name, - run_id=ctx.run_id, - ) def get_token(self): token_response = requests.post(self.config.token_url, data=self.token_data) @@ -307,6 +295,14 @@ def get_token(self): click.echo("Error: Token response invalid") exit() + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # for future developers: The actual logic of this ingestion wants to be executed, in order: # 1) the groups @@ -395,14 +391,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: datahub_corp_user_urn_to_group_membership, ) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) - def _add_group_members_to_group_membership( self, parent_corp_group_urn: str, diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index 0ee2a40c240c2..a5ba46be30800 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -5,7 +5,7 @@ from collections import defaultdict from dataclasses import dataclass, field from time import sleep -from typing import Dict, Iterable, List, Optional, Union +from typing import Dict, Iterable, List, Optional, Sequence, Union from okta.client import Client as OktaClient from okta.exceptions import OktaAPIException @@ -24,15 +24,8 @@ platform_name, support_status, ) -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -295,14 +288,13 @@ def __init__(self, config: OktaConfig, ctx: PipelineContext): self.report = OktaSourceReport() self.okta_client = self._create_okta_client() - # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=ctx.pipeline_name, - run_id=ctx.run_id, - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Step 0: get or create the event loop @@ -416,14 +408,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # Step 4: Close the event loop event_loop.close() - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) - def get_report(self): return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index fec12166daee9..fef9fff16e11e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -3,7 +3,7 @@ import logging from dataclasses import dataclass, field from enum import Enum -from typing import Any, Dict, Iterable, List, Optional, Type +from typing import Any, Dict, Iterable, List, Optional, Sequence, Type import confluent_kafka import confluent_kafka.admin @@ -35,16 +35,10 @@ support_status, ) from datahub.ingestion.api.registry import import_path -from datahub.ingestion.api.source import SourceCapability -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.kafka_schema_registry_base import KafkaSchemaRegistryBase -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -168,14 +162,6 @@ def __init__(self, config: KafkaSourceConfig, ctx: PipelineContext): cached_domains=[k for k in self.source_config.domain], graph=self.ctx.graph, ) - # Create and register the stateful ingestion use-case handlers. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.source_config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) def init_kafka_admin_client(self) -> None: try: @@ -199,13 +185,13 @@ def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource": config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict) return cls(config, ctx) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.source_config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: topics = self.consumer.list_topics( diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index fba9d697e9018..c52bbf0f11ce9 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -2,7 +2,7 @@ import re import sys from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Sequence, Tuple import jpype import jpype.imports @@ -27,15 +27,9 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import Source -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql.sql_common import get_platform_from_sqlalchemy_uri -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -939,15 +933,6 @@ def __init__(self, config: KafkaConnectSourceConfig, ctx: PipelineContext): } ) - # Create and register the stateful ingestion use-case handlers. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - # Test the connection if self.config.username is not None and self.config.password is not None: logger.info( @@ -1158,13 +1143,13 @@ def construct_job_workunits( ), ).as_workunit() - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: connectors_manifest = self.get_connectors_manifest() diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 0832cd3314767..4065ae286df3c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -1,6 +1,6 @@ """LDAP Source""" import dataclasses -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Sequence import ldap from ldap.controls import SimplePagedResultsControl @@ -15,13 +15,8 @@ platform_name, support_status, ) -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -189,14 +184,6 @@ def __init__(self, ctx: PipelineContext, config: LDAPSourceConfig): super(LDAPSource, self).__init__(config, ctx) self.config = config - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - # ensure prior defaults are in place for k in user_attrs_map: if k not in self.config.user_attrs_map: @@ -229,13 +216,13 @@ def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "LDAPSourc config = LDAPSourceConfig.parse_obj(config_dict) return cls(ctx, config) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """Returns an Iterable containing the workunits to ingest LDAP users or groups.""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 9538aadd5e9d3..b3cb0a2e09dd8 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -38,16 +38,12 @@ ) from datahub.ingestion.api.source import ( CapabilityReport, + MetadataWorkUnitProcessor, SourceCapability, SourceReport, TestableSource, TestConnectionReport, ) -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.looker import looker_usage from datahub.ingestion.source.looker.looker_common import ( @@ -68,7 +64,6 @@ LookerAPI, LookerAPIConfig, ) -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StatefulStaleMetadataRemovalConfig, @@ -236,15 +231,6 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): config=stat_generator_config, ) - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.source_config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: test_report = TestConnectionReport() @@ -1185,13 +1171,13 @@ def extract_usage_stat( return mcps - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.reporter, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.source_config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.reporter.report_stage_start("list_dashboards") diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 55615f2c1d290..f2b936de78cd4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -14,6 +14,7 @@ Iterable, List, Optional, + Sequence, Set, Tuple, Type, @@ -45,12 +46,7 @@ support_status, ) from datahub.ingestion.api.registry import import_path -from datahub.ingestion.api.source import SourceCapability -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes from datahub.ingestion.source.git.git_import import GitClone @@ -68,7 +64,6 @@ LookerAPIConfig, TransportOptionsConfig, ) -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -1473,14 +1468,6 @@ def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext): raise ValueError( "Failed to retrieve connections from looker client. 
Please check to ensure that you have manage_models permission enabled on this API key." ) - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.source_config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) def _load_model(self, path: str) -> LookerModel: with open(path, "r") as file: @@ -1795,13 +1782,13 @@ def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManife else: return None - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.reporter, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.source_config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: with tempfile.TemporaryDirectory("lookml_tmp") as tmp_dir: diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 0603699723ce8..22968db3c5120 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -4,7 +4,7 @@ # ######################################################### import logging -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, List, Optional, Sequence, Set, Tuple, Union import datahub.emitter.mce_builder as builder import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes @@ -19,12 +19,7 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceReport -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( BIContainerSubTypes, @@ -42,9 +37,6 @@ ) from datahub.ingestion.source.powerbi.m_query import parser, resolver from datahub.ingestion.source.powerbi.rest_api_wrapper.powerbi_api import PowerBiAPI -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -1113,12 +1105,8 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver) # Create and register the stateful ingestion use-case handler. 
-        self.stale_entity_removal_handler = StaleEntityRemovalHandler(
-            source=self,
-            config=self.source_config,
-            state_type_class=BaseSQLAlchemyCheckpointState,
-            pipeline_name=ctx.pipeline_name,
-            run_id=ctx.run_id,
+        self.stale_entity_removal_handler = StaleEntityRemovalHandler.create(
+            self, self.source_config, self.ctx
         )
 
     @classmethod
@@ -1197,6 +1185,17 @@ def get_workspace_workunit(
         for workunit in dataset_workunits:
             yield workunit
 
+    def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]:
+        # Since modified_workspaces is not idempotent, workunit processors are run separately for each workspace_id,
+        # which results in a checkpoint for each workspace_id.
+        if self.source_config.modified_since:
+            return []  # Handle these in get_workunits_internal
+        else:
+            return [
+                *super().get_workunit_processors(),
+                self.stale_entity_removal_handler.workunit_processor,
+            ]
+
     def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         """
         Datahub Ingestion framework invoke this method
@@ -1232,31 +1231,16 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
                     self.stale_entity_removal_handler
                 )
 
-                yield from auto_stale_entity_removal(
-                    self.stale_entity_removal_handler,
-                    auto_workunit_reporter(
-                        self.reporter,
-                        auto_status_aspect(self.get_workspace_workunit(workspace)),
-                    ),
+                yield from self._apply_workunit_processors(
+                    [
+                        *super().get_workunit_processors(),
+                        self.stale_entity_removal_handler.workunit_processor,
+                    ],
+                    self.get_workspace_workunit(workspace),
                 )
             else:
                 # Maintain backward compatibility
                 yield from self.get_workspace_workunit(workspace)
 
-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        # As modified_workspaces is not idempotent, hence auto_stale_entity_removal is run later for each workspace_id
-        # This will result in creating checkpoint for each workspace_id
-        if self.source_config.modified_since:
-            return self.get_workunits_internal()
-        else:
-            # Since we only run for a fixed list of workspace_ids
-            # This will result in one checkpoint for the list of configured workspace_ids
-            return auto_stale_entity_removal(
-                self.stale_entity_removal_handler,
-                auto_workunit_reporter(
-                    self.reporter, auto_status_aspect(self.get_workunits_internal())
-                ),
-            )
-
     def get_report(self) -> SourceReport:
         return self.reporter
diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py
index 6bd256e4e9708..306e22cd5be5c 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py
@@ -3,7 +3,7 @@
 import re
 from dataclasses import dataclass
 from hashlib import md5
-from typing import Iterable, List, Optional, Tuple
+from typing import Iterable, List, Optional, Sequence, Tuple
 
 import requests
 
@@ -25,15 +25,10 @@
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source_helpers import (
-    auto_stale_entity_removal,
-    auto_status_aspect,
-    auto_workunit_reporter,
-)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.ingestion.extractor import schema_util
 from datahub.ingestion.source.common.subtypes import DatasetSubTypes
-from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
 )
@@ -103,14 +98,6 @@ def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext):
self.platform: str = "pulsar" self.config: PulsarSourceConfig = config self.report: PulsarSourceReport = PulsarSourceReport() - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) self.base_url: str = f"{self.config.web_service_url}/admin/v2" self.tenants: List[str] = config.tenants @@ -235,13 +222,13 @@ def create(cls, config_dict, ctx): return cls(config, ctx) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 865d3b6940994..6d53c51daedd7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict -from typing import Dict, Iterable, List, Optional, Type, Union +from typing import Dict, Iterable, List, Optional, Sequence, Type, Union import humanfriendly @@ -26,14 +26,10 @@ ) from datahub.ingestion.api.source import ( CapabilityReport, + MetadataWorkUnitProcessor, TestableSource, TestConnectionReport, ) -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, @@ -68,9 +64,6 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantRunSkipHandler, ) -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -294,14 +287,6 @@ def __init__(self, config: RedshiftConfig, ctx: PipelineContext): self.config: RedshiftConfig = config self.report: RedshiftReport = RedshiftReport() self.platform = "redshift" - # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) self.domain_registry = None if self.config.domain: self.domain_registry = DomainRegistry( @@ -352,15 +337,6 @@ def get_redshift_connection( return conn - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, - auto_status_aspect(self.get_workunits_internal()), - ), - ) - def gen_database_container(self, database: str) -> Iterable[MetadataWorkUnit]: database_container_key = gen_database_key( database=database, @@ -375,6 +351,14 @@ def gen_database_container(self, database: str) -> Iterable[MetadataWorkUnit]: sub_types=[DatasetContainerSubTypes.DATABASE], ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: connection = RedshiftSource.get_redshift_connection(self.config) database = get_db_name(self.config) diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 3cf155d304bf8..4f4d2eeda2831 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -6,7 +6,7 @@ import time from collections import OrderedDict from datetime import datetime -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple import pydeequ from more_itertools import peekable @@ -50,13 +50,7 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceReport -from datahub.ingestion.api.source_helpers import ( - auto_materialize_referenced_tags, - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_boto_utils import get_s3_tags, list_folders from datahub.ingestion.source.aws.s3_util import ( @@ -70,7 +64,6 @@ from datahub.ingestion.source.s3.profiling import _SingleTableProfiler from datahub.ingestion.source.s3.report import DataLakeSourceReport from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -257,15 +250,6 @@ def __init__(self, config: DataLakeSourceConfig, ctx: PipelineContext): config_report, ) - # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - if config.profiling.enabled: telemetry.telemetry_instance.ping( "data_lake_profiling_config", @@ -875,16 +859,13 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: }, ) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_materialize_referenced_tags( - auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, - auto_status_aspect(self.get_workunits_internal()), - ), - ) - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.source_config, self.ctx + ).workunit_processor, + ] def is_s3_platform(self): return self.source_config.platform == "s3" diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py index bdea3381ef968..3e4ebef4ce8fa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from os.path import basename, dirname from pathlib import Path -from typing import Any, Iterable, Optional, Union +from typing import Any, Iterable, Optional, Sequence, Union import jsonref from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator @@ -30,12 +30,7 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import SourceCapability -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, SourceCapability from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.extractor.json_ref_patch import title_swapping_callback from datahub.ingestion.extractor.json_schema_util import ( @@ -239,13 +234,6 @@ def stringreplaceloader(match_string, replace_string, uri, **kwargs): def __init__(self, ctx: PipelineContext, config: JsonSchemaSourceConfig): super(JsonSchemaSource, self).__init__(ctx=ctx, config=config) self.config = config - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=JsonSchemaCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) self.report = StaleEntityRemovalSourceReport() def _load_one_file( @@ -330,13 +318,13 @@ def _load_one_file( ), ).as_workunit() - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.config.uri_replace_pattern: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 48c2acc066583..234e8bcf86398 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -4,7 +4,7 @@ import os.path import platform from dataclasses import dataclass -from typing import Callable, Dict, Iterable, List, Optional, Union, cast +from typing import Callable, Dict, Iterable, List, Optional, Sequence, Union, cast import pandas as pd from snowflake.connector import SnowflakeConnection @@ -28,17 +28,13 @@ ) from datahub.ingestion.api.source import ( CapabilityReport, + MetadataWorkUnitProcessor, Source, SourceCapability, SourceReport, TestableSource, TestConnectionReport, ) -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.glossary.classification_mixin import ClassificationHandler from datahub.ingestion.source.common.subtypes import ( @@ -98,9 +94,6 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantRunSkipHandler, ) -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -229,15 +222,6 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.snowsight_base_url: Optional[str] = None self.connection: Optional[SnowflakeConnection] = None - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - self.redundant_run_skip_handler = RedundantRunSkipHandler( source=self, config=self.config, @@ -481,6 +465,14 @@ def query(query): return _report + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() @@ -586,15 +578,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.usage_extractor.get_workunits(discovered_datasets) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, - auto_status_aspect(self.get_workunits_internal()), - ), - ) - def report_warehouse_failure(self): if self.config.warehouse is not None: self.report_error( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 4480e09cb0201..8e5f61242a801 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -11,6 +11,7 @@ Iterable, List, Optional, + Sequence, Set, Tuple, Type, @@ -32,11 +33,7 @@ ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from 
datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, @@ -51,9 +48,6 @@ gen_schema_key, get_domain_wu, ) -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -325,15 +319,6 @@ def __init__(self, config: SQLAlchemyConfig, ctx: PipelineContext, platform: str self.platform = platform self.report: SQLSourceReport = SQLSourceReport() - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=BaseSQLAlchemyCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - config_report = { config_option: config.dict().get(config_option) for config_option in config_options_to_report @@ -474,6 +459,14 @@ def add_table_to_schema_container( parent_container_key=schema_container_key, ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: sql_config = self.config if logger.isEnabledFor(logging.DEBUG): @@ -525,14 +518,6 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit profile_requests, profiler, platform=self.platform ) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) - def standardize_schema_table_names( self, schema: str, entity: str ) -> Tuple[str, str]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py index 9303a6af265ca..169dd645efd8f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py @@ -1,6 +1,7 @@ import logging from abc import ABC, abstractmethod from dataclasses import dataclass, field +from functools import partial from typing import ( Dict, Generic, @@ -17,9 +18,12 @@ import pydantic from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import JobId +from datahub.ingestion.api.source_helpers import auto_stale_entity_removal from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.state.checkpoint import Checkpoint, CheckpointStateBase +from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfig, StatefulIngestionConfigBase, @@ -176,6 +180,20 @@ def __init__( self._urns_to_skip: Set[str] = set() self.source.register_stateful_ingestion_usecase_handler(self) + @classmethod + def create( + cls, + source: StatefulIngestionSourceBase, + config: StatefulIngestionConfigBase, + ctx: PipelineContext, + state_type_class: Type[StaleEntityCheckpointStateBase] = GenericCheckpointState, + ) -> "StaleEntityRemovalHandler": + return 
cls(source, config, state_type_class, ctx.pipeline_name, ctx.run_id) + + @property + def workunit_processor(self): + return partial(auto_stale_entity_removal, self) + @classmethod def compute_job_id( cls, platform: Optional[str], unique_id: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index f2793d7afce5f..bad995ef322bd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -1,7 +1,7 @@ import json import logging from functools import lru_cache -from typing import Dict, Iterable, Optional +from typing import Dict, Iterable, Optional, Sequence import dateutil.parser as dp import requests @@ -18,15 +18,9 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import Source -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.sql import sql_common -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -194,15 +188,6 @@ def __init__(self, ctx: PipelineContext, config: SupersetConfig): pass # TODO(Gabe): how should we message about this error? - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: config = SupersetConfig.parse_obj(config_dict) @@ -416,13 +401,13 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.emit_dashboard_mces() yield from self.emit_chart_mces() - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_report(self) -> StaleEntityRemovalSourceReport: return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 343499973f9b8..07defbd7565bf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from datetime import datetime from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast import dateutil.parser as dp import tableauserverclient as TSC @@ -46,19 +46,13 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import Source -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, 
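# --- Illustrative sketch (not part of the patch) ---------------------------
# The workunit_processor property introduced above is just functools.partial
# binding the handler instance into a free function, which yields a callable
# with the Iterable -> Iterable processor signature. Toy reimplementation:
from functools import partial
from typing import Iterable, List


def record_and_forward(handler: "ToyHandler", stream: Iterable[str]) -> Iterable[str]:
    # Plays the role of auto_stale_entity_removal(handler, stream).
    for wu in stream:
        handler.seen.append(wu)
        yield wu


class ToyHandler:
    def __init__(self) -> None:
        self.seen: List[str] = []

    @property
    def workunit_processor(self):
        # Same shape as StaleEntityRemovalHandler.workunit_processor:
        # partial(auto_stale_entity_removal, self)
        return partial(record_and_forward, self)


handler = ToyHandler()
assert list(handler.workunit_processor(["a", "b"])) == ["a", "b"]
assert handler.seen == ["a", "b"]
# ---------------------------------------------------------------------------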
-) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source import tableau_constant from datahub.ingestion.source.common.subtypes import ( BIContainerSubTypes, DatasetSubTypes, ) -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -439,15 +433,6 @@ def __init__( # when emitting custom SQL data sources. self.custom_sql_ids_being_used: List[str] = [] - # Create and register the stateful ingestion use-case handlers. - self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - self._authenticate() def close(self) -> None: @@ -2256,14 +2241,6 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: config = TableauConfig.parse_obj(config_dict) return cls(config, ctx) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, auto_status_aspect(self.get_workunits_internal()) - ), - ) - def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: for _id, project in self.tableau_project_registry.items(): yield from gen_containers( @@ -2288,6 +2265,14 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: sub_types=[tableau_constant.PROJECT], ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: if self.server is None or not self.server.is_signed_in(): return diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 6a5d73bbb726b..b2cbfe721c737 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -2,7 +2,7 @@ import re import time from datetime import timedelta -from typing import Dict, Iterable, List, Optional, Set +from typing import Dict, Iterable, List, Optional, Sequence, Set from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -30,21 +30,16 @@ ) from datahub.ingestion.api.source import ( CapabilityReport, + MetadataWorkUnitProcessor, SourceCapability, TestableSource, TestConnectionReport, ) -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, ) -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -145,15 +140,6 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): else config.workspace_url.split("//")[1].split(".")[0] ) - # Create and register the stateful ingestion use-case handler. 
- self.stale_entity_removal_handler = StaleEntityRemovalHandler( - source=self, - config=self.config, - state_type_class=GenericCheckpointState, - pipeline_name=self.ctx.pipeline_name, - run_id=self.ctx.run_id, - ) - if self.config.domain: self.domain_registry = DomainRegistry( cached_domains=[k for k in self.config.domain], graph=self.ctx.graph @@ -211,14 +197,13 @@ def create(cls, config_dict, ctx): config = UnityCatalogSourceConfig.parse_obj(config_dict) return cls(ctx=ctx, config=config) - def get_workunits(self) -> Iterable[MetadataWorkUnit]: - return auto_stale_entity_removal( - self.stale_entity_removal_handler, - auto_workunit_reporter( - self.report, - auto_status_aspect(self.get_workunits_internal()), - ), - ) + def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: wait_on_warehouse = None diff --git a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py index 42b9da3b98904..82d2138057541 100644 --- a/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py +++ b/metadata-ingestion/tests/integration/azure_ad/test_azure_ad.py @@ -1,15 +1,14 @@ import json import pathlib from functools import partial -from typing import List, Optional, cast +from typing import List from unittest.mock import patch from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.identity.azure_ad import AzureADConfig, AzureADSource -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState +from datahub.ingestion.source.identity.azure_ad import AzureADConfig from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( validate_all_providers_have_committed_successfully, @@ -328,15 +327,6 @@ def test_azure_source_ingestion_disabled(pytestconfig, mock_datahub_graph, tmp_p ) -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - azure_ad_source = cast(AzureADSource, pipeline.source) - return azure_ad_source.get_current_checkpoint( - azure_ad_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) def test_azure_ad_stateful_ingestion( pytestconfig, tmp_path, mock_time, mock_datahub_graph diff --git a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py index 032ba93b1b4c3..d0181ab5dc292 100644 --- a/metadata-ingestion/tests/integration/iceberg/test_iceberg.py +++ b/metadata-ingestion/tests/integration/iceberg/test_iceberg.py @@ -1,16 +1,14 @@ from pathlib import PosixPath -from typing import Any, Dict, Optional, Union, cast +from typing import Any, Dict, Union from unittest.mock import patch import pytest from freezegun import freeze_time from iceberg.core.filesystem.file_status import FileStatus from iceberg.core.filesystem.local_filesystem import LocalFileSystem +from integration.integration_helpers import get_current_checkpoint_from_pipeline from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.iceberg.iceberg import IcebergSource -from 
datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( run_and_get_pipeline, @@ -22,15 +20,6 @@ GMS_SERVER = f"http://localhost:{GMS_PORT}" -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - iceberg_source = cast(IcebergSource, pipeline.source) - return iceberg_source.get_current_checkpoint( - iceberg_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_iceberg_ingest(pytestconfig, tmp_path, mock_time): diff --git a/metadata-ingestion/tests/integration/integration_helpers.py b/metadata-ingestion/tests/integration/integration_helpers.py new file mode 100644 index 0000000000000..41a5251354e85 --- /dev/null +++ b/metadata-ingestion/tests/integration/integration_helpers.py @@ -0,0 +1,19 @@ +from typing import Optional, cast + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.state.checkpoint import Checkpoint +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) + + +def get_current_checkpoint_from_pipeline( + pipeline: Pipeline, +) -> Optional[Checkpoint]: + source = cast(StatefulIngestionSourceBase, pipeline.source) + return source.get_current_checkpoint( + StaleEntityRemovalHandler.compute_job_id(getattr(source, "platform", "default")) + ) diff --git a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py index b229fc8beebce..ea1b5baa54dbf 100644 --- a/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py +++ b/metadata-ingestion/tests/integration/kafka-connect/test_kafka_connect.py @@ -1,14 +1,14 @@ import subprocess import time -from typing import Any, Dict, List, Optional, cast +from typing import Any, Dict, List, cast from unittest import mock import pytest import requests from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from tests.test_helpers import mce_helpers from tests.test_helpers.click_helpers import run_datahub_cmd @@ -489,14 +489,3 @@ def test_kafka_connect_ingest_stateful( "urn:li:dataJob:(urn:li:dataFlow:(kafka-connect,connect-instance-1.mysql_source2,PROD),librarydb.member)", ] assert sorted(deleted_job_urns) == sorted(difference_job_urns) - - -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint]: - from datahub.ingestion.source.kafka_connect import KafkaConnectSource - - kafka_connect_source = cast(KafkaConnectSource, pipeline.source) - return kafka_connect_source.get_current_checkpoint( - kafka_connect_source.stale_entity_removal_handler.job_id - ) diff --git a/metadata-ingestion/tests/integration/kafka/test_kafka_state.py b/metadata-ingestion/tests/integration/kafka/test_kafka_state.py index 52940696df45c..4c065bd056f21 100644 --- a/metadata-ingestion/tests/integration/kafka/test_kafka_state.py +++ b/metadata-ingestion/tests/integration/kafka/test_kafka_state.py @@ 
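# --- Illustrative sketch (not part of the patch) ---------------------------
# The shared integration_helpers above keys the checkpoint lookup on
# StaleEntityRemovalHandler.compute_job_id(getattr(source, "platform", "default")).
# The getattr fallback is the load-bearing part: sources without a `platform`
# attribute still resolve to a stable job id. Toy model of that lookup; the
# real job-id format is a DataHub implementation detail, only the
# platform-or-default scoping is shown here.
from typing import Dict, Optional


def compute_job_id(platform: Optional[str]) -> str:
    return f"stale_entity_removal::{platform}"


class GlueLikeSource:
    platform = "glue"


class PlatformlessSource:
    pass


checkpoints: Dict[str, str] = {
    "stale_entity_removal::glue": "glue-state",
    "stale_entity_removal::default": "fallback-state",
}

for source, expected in [
    (GlueLikeSource(), "glue-state"),
    (PlatformlessSource(), "fallback-state"),
]:
    job_id = compute_job_id(getattr(source, "platform", "default"))
    assert checkpoints[job_id] == expected
# ---------------------------------------------------------------------------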
-1,15 +1,12 @@ import time -from typing import Any, Dict, List, Optional, cast +from typing import Any, Dict, List from unittest.mock import patch import pytest from confluent_kafka.admin import AdminClient, NewTopic from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.kafka import KafkaSource -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from tests.test_helpers.docker_helpers import wait_for_port from tests.test_helpers.state_helpers import ( run_and_get_pipeline, @@ -81,15 +78,6 @@ def __exit__(self, exc_type, exc, traceback): self.delete_kafka_topics(self.topics) -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - kafka_source = cast(KafkaSource, pipeline.source) - return kafka_source.get_current_checkpoint( - kafka_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_kafka_ingest_with_stateful( diff --git a/metadata-ingestion/tests/integration/ldap/test_ldap_stateful.py b/metadata-ingestion/tests/integration/ldap/test_ldap_stateful.py index 1f6fa3d40530c..aceed54dce906 100644 --- a/metadata-ingestion/tests/integration/ldap/test_ldap_stateful.py +++ b/metadata-ingestion/tests/integration/ldap/test_ldap_stateful.py @@ -1,15 +1,12 @@ import pathlib import time -from typing import Optional, cast from unittest import mock import pytest from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.ldap import LDAPSource -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from tests.test_helpers import mce_helpers from tests.test_helpers.docker_helpers import wait_for_port from tests.test_helpers.state_helpers import ( @@ -90,15 +87,6 @@ def ldap_ingest_common( return pipeline -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - ldap_source = cast(LDAPSource, pipeline.source) - return ldap_source.get_current_checkpoint( - ldap_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_ldap_stateful( diff --git a/metadata-ingestion/tests/integration/looker/test_looker.py b/metadata-ingestion/tests/integration/looker/test_looker.py index 3e1618ecfc112..c3292376bfd8d 100644 --- a/metadata-ingestion/tests/integration/looker/test_looker.py +++ b/metadata-ingestion/tests/integration/looker/test_looker.py @@ -5,6 +5,7 @@ from unittest import mock from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from looker_sdk.rtl import transport from looker_sdk.rtl.transport import TransportOptions from looker_sdk.sdk.api40.models import ( @@ -27,8 +28,6 @@ LookViewField, UserViewField, ) -from datahub.ingestion.source.looker.looker_source import LookerDashboardSource -from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( @@ -719,12 +718,3 
@@ def looker_source_config(sink_file_name): assert len(difference_dashboard_urns) == 1 deleted_dashboard_urns = ["urn:li:dashboard:(looker,dashboards.11)"] assert sorted(deleted_dashboard_urns) == sorted(difference_dashboard_urns) - - -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint]: - dbt_source = cast(LookerDashboardSource, pipeline.source) - return dbt_source.get_current_checkpoint( - dbt_source.stale_entity_removal_handler.job_id - ) diff --git a/metadata-ingestion/tests/integration/lookml/test_lookml.py b/metadata-ingestion/tests/integration/lookml/test_lookml.py index 173b0983ed614..5b2d5f094be37 100644 --- a/metadata-ingestion/tests/integration/lookml/test_lookml.py +++ b/metadata-ingestion/tests/integration/lookml/test_lookml.py @@ -1,12 +1,13 @@ import logging import pathlib -from typing import Any, Dict, List, Optional, cast +from typing import Any, Dict, List, cast from unittest import mock import pydantic import pytest from deepdiff import DeepDiff from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from looker_sdk.sdk.api40.models import DBConnection from datahub.configuration.common import PipelineExecutionError @@ -15,10 +16,8 @@ from datahub.ingestion.source.looker.lookml_source import ( LookerModel, LookerRefinementResolver, - LookMLSource, LookMLSourceConfig, ) -from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.metadata.schema_classes import ( DatasetSnapshotClass, @@ -847,15 +846,6 @@ def test_lookml_ingest_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_ assert sorted(deleted_dataset_urns) == sorted(difference_dataset_urns) -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint]: - dbt_source = cast(LookMLSource, pipeline.source) - return dbt_source.get_current_checkpoint( - dbt_source.stale_entity_removal_handler.job_id - ) - - def test_lookml_base_folder(): fake_api = { "base_url": "https://filler.cloud.looker.com", diff --git a/metadata-ingestion/tests/integration/okta/test_okta.py b/metadata-ingestion/tests/integration/okta/test_okta.py index 04f78efacf658..6b65e156f65f6 100644 --- a/metadata-ingestion/tests/integration/okta/test_okta.py +++ b/metadata-ingestion/tests/integration/okta/test_okta.py @@ -1,18 +1,16 @@ import asyncio import pathlib from functools import partial -from typing import Optional, cast from unittest.mock import Mock, patch import jsonpickle import pytest from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from okta.models import Group, User from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.identity.okta import OktaConfig, OktaSource -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState +from datahub.ingestion.source.identity.okta import OktaConfig from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( validate_all_providers_have_committed_successfully, @@ -203,15 +201,6 @@ def test_okta_source_custom_user_name_regex(pytestconfig, mock_datahub_graph, tm ) -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - azure_ad_source = cast(OktaSource, pipeline.source) - return 
azure_ad_source.get_current_checkpoint( - azure_ad_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) def test_okta_stateful_ingestion(pytestconfig, tmp_path, mock_time, mock_datahub_graph): test_resources_dir: pathlib.Path = pytestconfig.rootpath / "tests/integration/okta" diff --git a/metadata-ingestion/tests/integration/superset/test_superset.py b/metadata-ingestion/tests/integration/superset/test_superset.py index 656819dbcd73d..b04ff1f7d869f 100644 --- a/metadata-ingestion/tests/integration/superset/test_superset.py +++ b/metadata-ingestion/tests/integration/superset/test_superset.py @@ -1,13 +1,11 @@ -from typing import Any, Dict, Optional, cast +from typing import Any, Dict from unittest.mock import patch import pytest from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState -from datahub.ingestion.source.superset import SupersetSource from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import ( run_and_get_pipeline, @@ -19,15 +17,6 @@ GMS_SERVER = f"http://localhost:{GMS_PORT}" -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - superset_source = cast(SupersetSource, pipeline.source) - return superset_source.get_current_checkpoint( - superset_source.stale_entity_removal_handler.job_id - ) - - def register_mock_api(request_mock: Any, override_data: dict = {}) -> None: api_vs_response = { "mock://mock-domain.superset.com/api/v1/security/login": { diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 496a7b7486a28..a6ddaf65a9d68 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -2,11 +2,12 @@ import logging import pathlib import sys -from typing import Optional, cast +from typing import cast from unittest import mock import pytest from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from requests.adapters import ConnectionError from tableauserverclient.models import ( DatasourceItem, @@ -17,8 +18,6 @@ from datahub.configuration.source_common import DEFAULT_ENV from datahub.ingestion.run.pipeline import Pipeline, PipelineContext -from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState from datahub.ingestion.source.tableau import TableauConfig, TableauSource from datahub.ingestion.source.tableau_common import ( TableauLineageOverrides, @@ -253,15 +252,6 @@ def tableau_ingest_common( return pipeline -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint[GenericCheckpointState]]: - tableau_source = cast(TableauSource, pipeline.source) - return tableau_source.get_current_checkpoint( - tableau_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) @pytest.mark.integration def test_tableau_ingest(pytestconfig, tmp_path, mock_datahub_graph): diff --git a/metadata-ingestion/tests/unit/test_glue_source.py b/metadata-ingestion/tests/unit/test_glue_source.py index 294bea9cdf9c8..b083c331c4448 
100644 --- a/metadata-ingestion/tests/unit/test_glue_source.py +++ b/metadata-ingestion/tests/unit/test_glue_source.py @@ -7,13 +7,12 @@ import pytest from botocore.stub import Stubber from freezegun import freeze_time +from integration.integration_helpers import get_current_checkpoint_from_pipeline from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields -from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.sink.file import write_metadata_file from datahub.ingestion.source.aws.glue import GlueSource, GlueSourceConfig -from datahub.ingestion.source.state.checkpoint import Checkpoint from datahub.ingestion.source.state.sql_common_state import ( BaseSQLAlchemyCheckpointState, ) @@ -240,15 +239,6 @@ def test_config_without_platform(): assert source.platform == "glue" -def get_current_checkpoint_from_pipeline( - pipeline: Pipeline, -) -> Optional[Checkpoint]: - glue_source = cast(GlueSource, pipeline.source) - return glue_source.get_current_checkpoint( - glue_source.stale_entity_removal_handler.job_id - ) - - @freeze_time(FROZEN_TIME) def test_glue_stateful(pytestconfig, tmp_path, mock_time, mock_datahub_graph): test_resources_dir = pytestconfig.rootpath / "tests/unit/glue" diff --git a/metadata-ingestion/tests/unit/test_source.py b/metadata-ingestion/tests/unit/test_source.py index d337cc8b46447..fef6e8b379969 100644 --- a/metadata-ingestion/tests/unit/test_source.py +++ b/metadata-ingestion/tests/unit/test_source.py @@ -9,7 +9,7 @@ class FakeSource(Source): - def get_workunits(self) -> Iterable[MetadataWorkUnit]: + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: return [ MetadataWorkUnit( id="test-workunit", From 358be3e24b271362c36ae1040bde45287b94ee6b Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 23 May 2023 11:30:42 -0700 Subject: [PATCH 2/6] pr feedback --- .../src/datahub/ingestion/api/source.py | 3 ++- .../src/datahub/ingestion/source/aws/glue.py | 2 +- .../ingestion/source/bigquery_v2/bigquery.py | 15 ++------------- .../datahub/ingestion/source/dbt/dbt_common.py | 15 ++------------- .../src/datahub/ingestion/source/file.py | 5 +++-- .../datahub/ingestion/source/gcs/gcs_source.py | 4 ++-- .../datahub/ingestion/source/iceberg/iceberg.py | 4 ++-- .../datahub/ingestion/source/identity/azure_ad.py | 4 ++-- .../src/datahub/ingestion/source/identity/okta.py | 4 ++-- .../src/datahub/ingestion/source/kafka.py | 4 ++-- .../src/datahub/ingestion/source/kafka_connect.py | 4 ++-- .../src/datahub/ingestion/source/ldap.py | 4 ++-- .../ingestion/source/looker/looker_source.py | 2 +- .../ingestion/source/looker/lookml_source.py | 3 +-- .../datahub/ingestion/source/powerbi/powerbi.py | 4 ++-- .../src/datahub/ingestion/source/pulsar.py | 4 ++-- .../datahub/ingestion/source/redshift/redshift.py | 4 ++-- .../src/datahub/ingestion/source/s3/source.py | 4 ++-- .../ingestion/source/schema/json_schema.py | 4 ++-- .../ingestion/source/snowflake/snowflake_v2.py | 4 ++-- .../datahub/ingestion/source/sql/sql_common.py | 3 +-- .../src/datahub/ingestion/source/superset.py | 4 ++-- .../src/datahub/ingestion/source/tableau.py | 4 ++-- .../src/datahub/ingestion/source/unity/source.py | 4 ++-- 24 files changed, 45 insertions(+), 67 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 06dce219da34d..14d9fbfd2166f 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ 
b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -9,6 +9,7 @@ Dict, Generic, Iterable, + List, Optional, Sequence, Set, @@ -176,7 +177,7 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": # can't make this method abstract. raise NotImplementedError('sources must implement "create"') - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: """A list of functions that transforms the workunits produced by this source. Run in order, first in list is applied first. Be careful with order when overriding. """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index db0b51a10fce7..e40340246f38b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -912,7 +912,7 @@ def _get_domain_wu( domain_urn=domain_urn, ) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 243e5a089fc35..725a73a5e6fce 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -5,18 +5,7 @@ import traceback from collections import defaultdict from datetime import datetime, timedelta, timezone -from typing import ( - Dict, - Iterable, - List, - Optional, - Sequence, - Set, - Tuple, - Type, - Union, - cast, -) +from typing import Dict, Iterable, List, Optional, Set, Tuple, Type, Union, cast from google.cloud import bigquery from google.cloud.bigquery.table import TableListItem @@ -485,7 +474,7 @@ def gen_dataset_containers( tags=tags_joined, ) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 26781e8c0a431..9243a0f291858 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -5,18 +5,7 @@ from dataclasses import dataclass, field from datetime import datetime from enum import auto -from typing import ( - Any, - Callable, - ClassVar, - Dict, - Iterable, - List, - Optional, - Sequence, - Tuple, - Union, -) +from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Tuple, Union import pydantic from pydantic import root_validator, validator @@ -879,7 +868,7 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: # return dbt nodes + global custom properties raise NotImplementedError() - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index 2829bd5cd7bdb..0ba98061b7ed1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -8,7 +8,7 @@ from enum import auto from functools import partial from io import BufferedReader -from typing import Any, Dict, Iterable, Iterator, Optional, Sequence, Tuple, Union +from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union from urllib import parse import ijson @@ -207,7 +207,8 @@ def get_filenames(self) -> Iterable[str]: self.report.total_num_files = 1 return [str(self.config.path)] - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + # No super() call, as we don't want helpers that create / remove workunits return [partial(auto_workunit_reporter, self.report)] def get_workunits_internal( diff --git a/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py b/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py index 99adc9d38e7ad..251e6e1e3bc5b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/gcs/gcs_source.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional, Sequence +from typing import Dict, Iterable, List, Optional from urllib.parse import unquote from pydantic import Field, SecretStr, validator @@ -170,7 +170,7 @@ def s3_source_overrides(self, source: S3Source) -> S3Source: return source - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py index eaede94774679..8f36f35c21a5c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py +++ b/metadata-ingestion/src/datahub/ingestion/source/iceberg/iceberg.py @@ -1,7 +1,7 @@ import json import logging import uuid -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple from iceberg.api import types as IcebergTypes from iceberg.api.table import Table @@ -119,7 +119,7 @@ def create(cls, config_dict: Dict, ctx: PipelineContext) -> "IcebergSource": config = IcebergSourceConfig.parse_obj(config_dict) return cls(config, ctx) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py index fb0acc0a0e270..5783304da5ef2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/azure_ad.py @@ -4,7 +4,7 @@ import urllib from collections import defaultdict from dataclasses import dataclass, field -from typing import Any, Dict, Generator, Iterable, List, Optional, Sequence +from typing import Any, Dict, Generator, Iterable, List, 
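# --- Illustrative sketch (not part of the patch) ---------------------------
# Two override styles coexist in this patch: most sources *extend* the base
# list (*super().get_workunit_processors(), extra), while the file source
# *replaces* it, keeping only reporting, since (per the comment above) it
# must not run helpers that create or remove workunits when replaying
# metadata files. Toy illustration of the difference:
from typing import Callable, Iterable, List, Optional

Processor = Callable[[Iterable[str]], Iterable[str]]


def add_status(stream: Iterable[str]) -> Iterable[str]:
    return (f"{wu}+status" for wu in stream)  # stand-in for auto_status_aspect


def report(stream: Iterable[str]) -> Iterable[str]:
    return iter(stream)  # stand-in for auto_workunit_reporter


def base_processors() -> List[Optional[Processor]]:
    return [add_status, report]


extend_style: List[Optional[Processor]] = [*base_processors(), add_status]
replace_style: List[Optional[Processor]] = [report]  # file.py's approach
assert add_status in extend_style and add_status not in replace_style
# ---------------------------------------------------------------------------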
Optional import click import requests @@ -295,7 +295,7 @@ def get_token(self): click.echo("Error: Token response invalid") exit() - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py index a5ba46be30800..78e048c63030f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py +++ b/metadata-ingestion/src/datahub/ingestion/source/identity/okta.py @@ -5,7 +5,7 @@ from collections import defaultdict from dataclasses import dataclass, field from time import sleep -from typing import Dict, Iterable, List, Optional, Sequence, Union +from typing import Dict, Iterable, List, Optional, Union from okta.client import Client as OktaClient from okta.exceptions import OktaAPIException @@ -288,7 +288,7 @@ def __init__(self, config: OktaConfig, ctx: PipelineContext): self.report = OktaSourceReport() self.okta_client = self._create_okta_client() - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index fef9fff16e11e..420a4aa010790 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -3,7 +3,7 @@ import logging from dataclasses import dataclass, field from enum import Enum -from typing import Any, Dict, Iterable, List, Optional, Sequence, Type +from typing import Any, Dict, Iterable, List, Optional, Type import confluent_kafka import confluent_kafka.admin @@ -185,7 +185,7 @@ def create(cls, config_dict: Dict, ctx: PipelineContext) -> "KafkaSource": config: KafkaSourceConfig = KafkaSourceConfig.parse_obj(config_dict) return cls(config, ctx) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index c52bbf0f11ce9..bdf1a5861dd5c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -2,7 +2,7 @@ import re import sys from dataclasses import dataclass, field -from typing import Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Dict, Iterable, List, Optional, Tuple import jpype import jpype.imports @@ -1143,7 +1143,7 @@ def construct_job_workunits( ), ).as_workunit() - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py index 4065ae286df3c..ef35526792d6c 100644 --- 
a/metadata-ingestion/src/datahub/ingestion/source/ldap.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py @@ -1,6 +1,6 @@ """LDAP Source""" import dataclasses -from typing import Any, Dict, Iterable, List, Optional, Sequence +from typing import Any, Dict, Iterable, List, Optional import ldap from ldap.controls import SimplePagedResultsControl @@ -216,7 +216,7 @@ def create(cls, config_dict: Dict[str, Any], ctx: PipelineContext) -> "LDAPSourc config = LDAPSourceConfig.parse_obj(config_dict) return cls(ctx, config) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index b3cb0a2e09dd8..d74f082cd8d2d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -1171,7 +1171,7 @@ def extract_usage_stat( return mcps - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index f2b936de78cd4..4f0f82a86bc9d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -14,7 +14,6 @@ Iterable, List, Optional, - Sequence, Set, Tuple, Type, @@ -1782,7 +1781,7 @@ def get_manifest_if_present(self, folder: pathlib.Path) -> Optional[LookerManife else: return None - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 22968db3c5120..a8c69b0c6b211 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -4,7 +4,7 @@ # ######################################################### import logging -from typing import Iterable, List, Optional, Sequence, Set, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import datahub.emitter.mce_builder as builder import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes @@ -1185,7 +1185,7 @@ def get_workspace_workunit( for workunit in dataset_workunits: yield workunit - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: # As modified_workspaces is not idempotent, hence workunit processors are run later for each workspace_id # This will result in creating checkpoint for each workspace_id if self.source_config.modified_since: diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py 
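# --- Illustrative sketch (not part of the patch) ---------------------------
# The Power BI comment above describes a third pattern: when incremental
# ingestion is configured (modified_since), the shared processors are not
# applied to the global stream but run later, once per workspace, producing
# one checkpoint per workspace_id. Toy shape of such a conditional override;
# names and config semantics here are assumptions, not the real source:
from typing import Callable, Iterable, List, Optional

Processor = Callable[[Iterable[str]], Iterable[str]]


def get_processors(
    modified_since: Optional[str],
    base: List[Optional[Processor]],
    per_workspace: Processor,
) -> List[Optional[Processor]]:
    if modified_since:
        # Defer: the per-workspace processor is applied inside the
        # workspace loop instead of on the global stream.
        return []
    return [*base, per_workspace]


assert get_processors("2023-05-01", [], iter) == []
assert get_processors(None, [], iter) == [iter]
# ---------------------------------------------------------------------------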
index 306e22cd5be5c..512e8858558c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass from hashlib import md5 -from typing import Iterable, List, Optional, Sequence, Tuple +from typing import Iterable, List, Optional, Tuple import requests @@ -222,7 +222,7 @@ def create(cls, config_dict, ctx): return cls(config, ctx) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 6d53c51daedd7..79d6dcc65d82a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -1,6 +1,6 @@ import logging from collections import defaultdict -from typing import Dict, Iterable, List, Optional, Sequence, Type, Union +from typing import Dict, Iterable, List, Optional, Type, Union import humanfriendly @@ -351,7 +351,7 @@ def gen_database_container(self, database: str) -> Iterable[MetadataWorkUnit]: sub_types=[DatasetContainerSubTypes.DATABASE], ) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 4f4d2eeda2831..6b961f7c6b0da 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -6,7 +6,7 @@ import time from collections import OrderedDict from datetime import datetime -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import pydeequ from more_itertools import peekable @@ -859,7 +859,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: }, ) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py index 3e4ebef4ce8fa..2ac946b23deb0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema/json_schema.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from os.path import basename, dirname from pathlib import Path -from typing import Any, Iterable, Optional, Sequence, Union +from typing import Any, Iterable, List, Optional, Union import jsonref from pydantic import AnyHttpUrl, DirectoryPath, FilePath, validator @@ -318,7 +318,7 @@ def _load_one_file( ), ).as_workunit() - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), 
StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 234e8bcf86398..4101d9ad69fed 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -4,7 +4,7 @@ import os.path import platform from dataclasses import dataclass -from typing import Callable, Dict, Iterable, List, Optional, Sequence, Union, cast +from typing import Callable, Dict, Iterable, List, Optional, Union, cast import pandas as pd from snowflake.connector import SnowflakeConnection @@ -465,7 +465,7 @@ def query(query): return _report - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 8e5f61242a801..288b4bf1e78d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -11,7 +11,6 @@ Iterable, List, Optional, - Sequence, Set, Tuple, Type, @@ -459,7 +458,7 @@ def add_table_to_schema_container( parent_container_key=schema_container_key, ) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index bad995ef322bd..a6af6d62df9cc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -1,7 +1,7 @@ import json import logging from functools import lru_cache -from typing import Dict, Iterable, Optional, Sequence +from typing import Dict, Iterable, List, Optional import dateutil.parser as dp import requests @@ -401,7 +401,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: yield from self.emit_dashboard_mces() yield from self.emit_chart_mces() - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 07defbd7565bf..2f4def4e0b117 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -4,7 +4,7 @@ from dataclasses import dataclass from datetime import datetime from functools import lru_cache -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple, Union, cast +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import dateutil.parser as dp import tableauserverclient as TSC @@ -2265,7 +2265,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: sub_types=[tableau_constant.PROJECT], ) - def get_workunit_processors(self) -> 
Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index b2cbfe721c737..a0afaddec50a5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -2,7 +2,7 @@ import re import time from datetime import timedelta -from typing import Dict, Iterable, List, Optional, Sequence, Set +from typing import Dict, Iterable, List, Optional, Set from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -197,7 +197,7 @@ def create(cls, config_dict, ctx): config = UnityCatalogSourceConfig.parse_obj(config_dict) return cls(ctx=ctx, config=config) - def get_workunit_processors(self) -> Sequence[Optional[MetadataWorkUnitProcessor]]: + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), StaleEntityRemovalHandler.create( From 6428e772066c66990a7b5b9a73f3233b108917c1 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 23 May 2023 17:35:43 -0700 Subject: [PATCH 3/6] remove cyclical dependencies --- .../src/datahub/ingestion/api/source_helpers.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 6e27f1f95399c..ecd89bfc844f4 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -1,11 +1,7 @@ -from typing import Callable, Iterable, Optional, Set, TypeVar, Union +from typing import TYPE_CHECKING, Callable, Iterable, Optional, Set, TypeVar, Union from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.state.stale_entity_removal_handler import ( - StaleEntityRemovalHandler, -) from datahub.metadata.schema_classes import ( MetadataChangeEventClass, MetadataChangeProposalClass, @@ -16,6 +12,12 @@ from datahub.utilities.urns.urn import guess_entity_type from datahub.utilities.urns.urn_iter import list_urns +if TYPE_CHECKING: + from datahub.ingestion.api.source import SourceReport + from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, + ) + def auto_workunit( stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]] @@ -77,7 +79,7 @@ def _default_entity_type_fn(wu: MetadataWorkUnit) -> Optional[str]: def auto_stale_entity_removal( - stale_entity_removal_handler: StaleEntityRemovalHandler, + stale_entity_removal_handler: "StaleEntityRemovalHandler", stream: Iterable[MetadataWorkUnit], entity_type_fn: Callable[ [MetadataWorkUnit], Optional[str] @@ -106,7 +108,7 @@ def auto_stale_entity_removal( T = TypeVar("T", bound=MetadataWorkUnit) -def auto_workunit_reporter(report: SourceReport, stream: Iterable[T]) -> Iterable[T]: +def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Iterable[T]: """ Calls report.report_workunit() on each workunit. 
""" From 2c0d113e94cbabc6b90cddbf92dc46462cbd2f4d Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 23 May 2023 17:44:17 -0700 Subject: [PATCH 4/6] fix tests --- .../src/datahub/ingestion/source/aws/glue.py | 9 -------- .../tests/test_helpers/state_helpers.py | 5 +++-- .../tests/unit/glue/glue_mces_golden.json | 22 +++++++++++++++++++ .../glue_mces_platform_instance_golden.json | 22 +++++++++++++++++++ 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index e40340246f38b..9a07edc4c0382 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -10,7 +10,6 @@ List, Mapping, Optional, - Sequence, Set, Tuple, Union, @@ -50,11 +49,6 @@ support_status, ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor -from datahub.ingestion.api.source_helpers import ( - auto_stale_entity_removal, - auto_status_aspect, - auto_workunit_reporter, -) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws import s3_util from datahub.ingestion.source.aws.aws_common import AwsSourceConfig @@ -64,9 +58,6 @@ DatasetSubTypes, ) from datahub.ingestion.source.glue_profiling_config import GlueProfilingConfig -from datahub.ingestion.source.state.sql_common_state import ( - BaseSQLAlchemyCheckpointState, -) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, diff --git a/metadata-ingestion/tests/test_helpers/state_helpers.py b/metadata-ingestion/tests/test_helpers/state_helpers.py index c2d1fa3d50220..76f2ab283790f 100644 --- a/metadata-ingestion/tests/test_helpers/state_helpers.py +++ b/metadata-ingestion/tests/test_helpers/state_helpers.py @@ -107,7 +107,8 @@ def get_current_checkpoint_from_pipeline( # TODO: This only works for stale entity removal. We need to generalize this. 
stateful_source = cast(StatefulIngestionSourceBase, pipeline.source) - stale_entity_removal_handler: StaleEntityRemovalHandler = stateful_source.stale_entity_removal_handler # type: ignore return stateful_source.state_provider.get_current_checkpoint( - stale_entity_removal_handler.job_id + StaleEntityRemovalHandler.compute_job_id( + getattr(stateful_source, "platform", "default") + ) ) diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json index 97d75a9a26df4..a0a4ab00a429e 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_golden.json @@ -1253,5 +1253,27 @@ "removed": false } } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:baz:bob", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "baz:bob" + } + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:foo:bar", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "foo:bar" + } + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json index 5fc2a6f9e59e5..6a5f47ef67cf5 100644 --- a/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json +++ b/metadata-ingestion/tests/unit/glue/glue_mces_platform_instance_golden.json @@ -1260,5 +1260,27 @@ "removed": false } } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:baz:bob", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "baz:bob" + } + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:foo:bar", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "foo:bar" + } + } } ] \ No newline at end of file From 4a61ef9b51883080a6db6893989a03d95d877a62 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Tue, 23 May 2023 19:30:30 -0700 Subject: [PATCH 5/6] update goldens --- .../integration/lookml/expected_output.json | 45 ++ .../lookml_mces_golden_deleted_stateful.json | 65 ++- .../refinement_include_order_golden.json | 30 ++ .../lookml/refinements_ingestion_golden.json | 45 ++ ..._config_and_modified_since_admin_only.json | 15 + ...bleau_extract_all_project_mces_golden.json | 210 ++++++++++ .../tableau/tableau_mces_golden.json | 210 ++++++++++ .../tableau_mces_golden_deleted_stateful.json | 388 ++++++++++++++---- .../tableau_nested_project_mces_golden.json | 210 ++++++++++ .../tableau_signout_timeout_mces_golden.json | 210 ++++++++++ ...au_with_platform_instance_mces_golden.json | 210 ++++++++++ 11 files changed, 1539 insertions(+), 99 deletions(-) diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index 10f40333eed92..bc5cb11fd15ff 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -1790,5 +1790,50 @@ "lastObserved": 1586847600000, "runId": "lookml-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "tagKey", + 
"aspect": { + "json": { + "name": "Measure" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Temporal", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Temporal" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json index b119439c1799d..434310b13ef05 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_golden_deleted_stateful.json @@ -387,9 +387,54 @@ "runId": "lookml-test" } }, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Measure" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Temporal", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Temporal" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -419,7 +464,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -434,7 +479,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -449,7 +494,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.test_include_external_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -464,7 +509,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.extending_looker_events,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -479,7 +524,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.fragment_derived_view,PROD)", "changeType": 
"UPSERT", "aspectName": "status", "aspect": { @@ -494,7 +539,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.autodetect_sql_name_based_on_view_name,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -509,7 +554,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.flights,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -524,7 +569,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.my_derived_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.ability,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -539,7 +584,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.include_able_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.view_derived_explore,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json index 95b785a6557ad..875b07647b166 100644 --- a/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinement_include_order_golden.json @@ -702,5 +702,35 @@ "lastObserved": 1586847600000, "runId": "lookml-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Measure" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index 7fe05ee5fcdc3..7907a7d1122ff 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -1822,5 +1822,50 @@ "lastObserved": 1586847600000, "runId": "lookml-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Dimension", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Dimension" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Measure", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Measure" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Temporal", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Temporal" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test" + } 
} ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json b/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json index a69c978b1df71..1f0084380dfc8 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json @@ -1114,5 +1114,20 @@ "lastObserved": 1643871600000, "runId": "powerbi-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Certified", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Certified" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/tableau/tableau_extract_all_project_mces_golden.json b/metadata-ingestion/tests/integration/tableau/tableau_extract_all_project_mces_golden.json index 53f507e115fa5..a4c8733f5b26b 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_extract_all_project_mces_golden.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_extract_all_project_mces_golden.json @@ -42591,5 +42591,215 @@ "lastObserved": 1638860400000, "runId": "tableau-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:ATTRIBUTE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "ATTRIBUTE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:BINFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "BINFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:CALCULATEDFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "CALCULATEDFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COLUMNFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COLUMNFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COUNT", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COUNT" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DATASOURCEFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DATASOURCEFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DIMENSION", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DIMENSION" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:GROUPFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "GROUPFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": 
"tag", + "entityUrn": "urn:li:tag:HIERARCHYFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "HIERARCHYFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:MEASURE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "MEASURE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SETFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SETFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SUM", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SUM" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:TagSheet3", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "TagSheet3" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:YEAR", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "YEAR" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/tableau/tableau_mces_golden.json b/metadata-ingestion/tests/integration/tableau/tableau_mces_golden.json index 04d4d9e8e5838..79b277f306fd8 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_mces_golden.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_mces_golden.json @@ -42403,5 +42403,215 @@ "lastObserved": 1638860400000, "runId": "tableau-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:ATTRIBUTE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "ATTRIBUTE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:BINFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "BINFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:CALCULATEDFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "CALCULATEDFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COLUMNFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COLUMNFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COUNT", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COUNT" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DATASOURCEFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DATASOURCEFIELD" + } + }, + "systemMetadata": { + "lastObserved": 
1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DIMENSION", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DIMENSION" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:GROUPFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "GROUPFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:HIERARCHYFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "HIERARCHYFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:MEASURE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "MEASURE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SETFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SETFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SUM", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SUM" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:TagSheet3", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "TagSheet3" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:YEAR", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "YEAR" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/tableau/tableau_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/tableau/tableau_mces_golden_deleted_stateful.json index 3785b2572d7e4..1e9da1d2a3736 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_mces_golden_deleted_stateful.json @@ -199,7 +199,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sys_user_group,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,22b0b4c3-6b85-713d-a161-5a87fdd78f40,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -214,7 +214,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,618c87db-5959-338b-bcc7-6f5f4cc0b6c6,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,dfe2c02a-54b7-f7a2-39fc-c651da2f6ad8,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -228,8 +228,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,b679da5e-7d03-f01e-b2ea-01fb3c1926dc)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)", "changeType": "UPSERT", "aspectName": 
"status", "aspect": { @@ -244,7 +244,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,c57a5574-db47-46df-677f-0b708dab14db)", + "entityUrn": "urn:li:chart:(tableau,53b8dc2f-8ada-51f7-7422-fe82e9b803cc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -259,7 +259,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,373c6466-bb0c-b319-8752-632456349261)", + "entityUrn": "urn:li:chart:(tableau,38130558-4194-2e2a-3046-c0d887829cb4)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -274,7 +274,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity6,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.actor,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -289,7 +289,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.orders,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,d8d4c0ea-3162-fa11-31e6-26675da44a38,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -303,8 +303,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,4644ccb1-2adc-cf26-c654-04ed1dcc7090,PROD)", + "entityType": "tag", + "entityUrn": "urn:li:tag:DIMENSION", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -318,8 +318,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.customer,PROD)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(tableau,5dcaaf46-e6fb-2548-e763-272a7ab2c9b1)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -334,7 +334,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,e70a540d-55ed-b9cc-5a3c-01ebe81a1274)", + "entityUrn": "urn:li:chart:(tableau,b207c2f2-b675-32e3-2663-17bb836a018b)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -349,7 +349,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity11,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,00cce29f-b561-bb41-3557-8e19660bb5dd,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -364,7 +364,22 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,6cbbeeb2-9f3a-00f6-2342-17139d6e97ae,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,06c3e060-8133-4b58-9b53-a0fced25e056,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.address,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -379,7 +394,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,38130558-4194-2e2a-3046-c0d887829cb4)", + "entityUrn": "urn:li:chart:(tableau,7ef184c1-5a41-5ec8-723e-ae44c20aa335)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -393,8 +408,23 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,801c95e3-b07e-7bfe-3789-a561c7beccd3,PROD)", + "entityType": "tag", + "entityUrn": 
"urn:li:tag:DATASOURCEFIELD", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,7fbc77ba-0ab6-3727-0db3-d8402a804da5)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -409,7 +439,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.staff,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -424,7 +454,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sc_request,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.problem,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -439,7 +469,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.task,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sys_user_group,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -453,8 +483,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,20fc5eb7-81eb-aa18-8c39-af501c62d085)", + "entityType": "tag", + "entityUrn": "urn:li:tag:CALCULATEDFIELD", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -469,7 +499,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,d8d4c0ea-3162-fa11-31e6-26675da44a38,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.campaignstable,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -483,8 +513,23 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,d00f4ba6-707e-4684-20af-69eb47587cc2,PROD)", + "entityType": "tag", + "entityUrn": "urn:li:tag:ATTRIBUTE", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SUM", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -499,7 +544,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,00cce29f-b561-bb41-3557-8e19660bb5dd,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.incident,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -514,7 +559,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,222d1406-de0e-cd8d-0b94-9b45a0007e59)", + "entityUrn": "urn:li:chart:(tableau,f76d3570-23b8-f74b-d85c-cc5484c2079c)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -529,7 +574,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,58af9ecf-b839-da50-65e1-2e1fa20e3362)", + "entityUrn": "urn:li:chart:(tableau,222d1406-de0e-cd8d-0b94-9b45a0007e59)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -543,8 +588,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:047691e9c16bec8fb08e1df0f5d71c4d", + 
"entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.people,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -559,7 +604,52 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.campaignstable,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sc_cat_item,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.payment,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.customer,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:MEASURE", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -574,7 +664,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,2b5351c1-535d-4a4a-1339-c51ddd6abf8a)", + "entityUrn": "urn:li:chart:(tableau,e604255e-0573-3951-6db7-05bee48116c1)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -589,7 +679,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,53b8dc2f-8ada-51f7-7422-fe82e9b803cc)", + "entityUrn": "urn:li:chart:(tableau,721c3c41-7a2b-16a8-3281-6f948a44be96)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -603,8 +693,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.problem,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,2b73b9dd-4ec7-75ca-f2e9-fa1984ca8b72)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -619,7 +709,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,e604255e-0573-3951-6db7-05bee48116c1)", + "entityUrn": "urn:li:chart:(tableau,373c6466-bb0c-b319-8752-632456349261)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -633,8 +723,38 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity7,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:047691e9c16bec8fb08e1df0f5d71c4d", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(tableau,20e44c22-1ccd-301a-220c-7b6837d09a52)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -663,8 
+783,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,8385ea9a-0749-754f-7ad9-824433de2120)", + "entityType": "tag", + "entityUrn": "urn:li:tag:HIERARCHYFIELD", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -678,8 +798,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,618b3e76-75c1-cb31-0c61-3f4890b72c31)", + "entityType": "tag", + "entityUrn": "urn:li:tag:TagSheet3", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -694,7 +814,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sc_cat_item,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sc_req_item,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -708,8 +828,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,721c3c41-7a2b-16a8-3281-6f948a44be96)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity11,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -723,8 +843,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,2b73b9dd-4ec7-75ca-f2e9-fa1984ca8b72)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(tableau,8f7dd564-36b6-593f-3c6f-687ad06cd40b)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -739,7 +859,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,7ef184c1-5a41-5ec8-723e-ae44c20aa335)", + "entityUrn": "urn:li:chart:(tableau,c57a5574-db47-46df-677f-0b708dab14db)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -753,8 +873,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:fad3de4b86519c3edeb685215fe0bab1", + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,c14973c2-e1c3-563a-a9c1-8a408396d22a)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -769,7 +889,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,7fbc77ba-0ab6-3727-0db3-d8402a804da5)", + "entityUrn": "urn:li:chart:(tableau,8385ea9a-0749-754f-7ad9-824433de2120)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -783,8 +903,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:94e6e84b66f9ee8c70c22f06cfbad6a9", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.task,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -798,8 +918,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,06c3e060-8133-4b58-9b53-a0fced25e056,PROD)", + "entityType": "tag", + "entityUrn": "urn:li:tag:YEAR", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -813,8 +933,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(tableau,8f7dd564-36b6-593f-3c6f-687ad06cd40b)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,f4317efd-c3e6-6ace-8fe6-e71b590bbbcc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -829,7 +949,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:external,sample - superstore%2C %28new%29.xls.people,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,3ade7817-ae27-259e-8e48-1570e7f932f6,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -844,7 
+964,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.staff,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,6cbbeeb2-9f3a-00f6-2342-17139d6e97ae,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -859,7 +979,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,c14973c2-e1c3-563a-a9c1-8a408396d22a)", + "entityUrn": "urn:li:chart:(tableau,692a2da4-2a82-32c1-f713-63b8e4325d86)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -873,8 +993,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,692a2da4-2a82-32c1-f713-63b8e4325d86)", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,4644ccb1-2adc-cf26-c654-04ed1dcc7090,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -888,8 +1008,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.incident,PROD)", + "entityType": "tag", + "entityUrn": "urn:li:tag:BINFIELD", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -904,7 +1024,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,dfe2c02a-54b7-f7a2-39fc-c651da2f6ad8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,801c95e3-b07e-7bfe-3789-a561c7beccd3,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -918,8 +1038,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,22b0b4c3-6b85-713d-a161-5a87fdd78f40,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,20fc5eb7-81eb-aa18-8c39-af501c62d085)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -933,8 +1053,23 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(tableau,5dcaaf46-e6fb-2548-e763-272a7ab2c9b1)", + "entityType": "tag", + "entityUrn": "urn:li:tag:GROUPFIELD", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COUNT", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -949,7 +1084,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,b207c2f2-b675-32e3-2663-17bb836a018b)", + "entityUrn": "urn:li:chart:(tableau,8a6a269a-d6de-fae4-5050-513255b40ffc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -963,8 +1098,23 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.address,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:94e6e84b66f9ee8c70c22f06cfbad6a9", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COLUMNFIELD", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -979,7 +1129,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,4fb670d5-3e19-9656-e684-74aa9729cf18,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity7,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ 
-993,8 +1143,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,8a6a269a-d6de-fae4-5050-513255b40ffc)", + "entityType": "tag", + "entityUrn": "urn:li:tag:SETFIELD", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1009,7 +1159,22 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.actor,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,d00f4ba6-707e-4684-20af-69eb47587cc2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity6,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1024,7 +1189,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,f76d3570-23b8-f74b-d85c-cc5484c2079c)", + "entityUrn": "urn:li:chart:(tableau,58af9ecf-b839-da50-65e1-2e1fa20e3362)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1039,7 +1204,22 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,130496dc-29ca-8a89-e32b-d73c4d8b65ff)", + "entityUrn": "urn:li:chart:(tableau,618b3e76-75c1-cb31-0c61-3f4890b72c31)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,e70a540d-55ed-b9cc-5a3c-01ebe81a1274)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1054,7 +1234,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,demo_postgres_instance.dvdrental.public.payment,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sc_request,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1069,7 +1249,7 @@ }, { "entityType": "chart", - "entityUrn": "urn:li:chart:(tableau,f4317efd-c3e6-6ace-8fe6-e71b590bbbcc)", + "entityUrn": "urn:li:chart:(tableau,2b5351c1-535d-4a4a-1339-c51ddd6abf8a)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1083,8 +1263,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,618c87db-5959-338b-bcc7-6f5f4cc0b6c6,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1098,8 +1278,8 @@ } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.cmdb_ci,PROD)", + "entityType": "chart", + "entityUrn": "urn:li:chart:(tableau,b679da5e-7d03-f01e-b2ea-01fb3c1926dc)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1114,7 +1294,22 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.sc_req_item,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:marketo-marketo,marketo.activity10,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "chart", + "entityUrn": 
"urn:li:chart:(tableau,130496dc-29ca-8a89-e32b-d73c4d8b65ff)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1143,8 +1338,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(tableau,20e44c22-1ccd-301a-220c-7b6837d09a52)", + "entityType": "container", + "entityUrn": "urn:li:container:fad3de4b86519c3edeb685215fe0bab1", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1159,7 +1354,22 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,3ade7817-ae27-259e-8e48-1570e7f932f6,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:tableau,4fb670d5-3e19-9656-e684-74aa9729cf18,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": true + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:webdata-direct:servicenowitsm-servicenowitsm,ven01911.cmdb_ci,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/tableau/tableau_nested_project_mces_golden.json b/metadata-ingestion/tests/integration/tableau/tableau_nested_project_mces_golden.json index 53f507e115fa5..a4c8733f5b26b 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_nested_project_mces_golden.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_nested_project_mces_golden.json @@ -42591,5 +42591,215 @@ "lastObserved": 1638860400000, "runId": "tableau-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:ATTRIBUTE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "ATTRIBUTE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:BINFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "BINFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:CALCULATEDFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "CALCULATEDFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COLUMNFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COLUMNFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COUNT", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COUNT" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DATASOURCEFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DATASOURCEFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DIMENSION", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DIMENSION" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:GROUPFIELD", + "changeType": "UPSERT", + "aspectName": 
"tagKey", + "aspect": { + "json": { + "name": "GROUPFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:HIERARCHYFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "HIERARCHYFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:MEASURE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "MEASURE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SETFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SETFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SUM", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SUM" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:TagSheet3", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "TagSheet3" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:YEAR", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "YEAR" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/tableau/tableau_signout_timeout_mces_golden.json b/metadata-ingestion/tests/integration/tableau/tableau_signout_timeout_mces_golden.json index 04d4d9e8e5838..79b277f306fd8 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_signout_timeout_mces_golden.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_signout_timeout_mces_golden.json @@ -42403,5 +42403,215 @@ "lastObserved": 1638860400000, "runId": "tableau-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:ATTRIBUTE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "ATTRIBUTE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:BINFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "BINFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:CALCULATEDFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "CALCULATEDFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COLUMNFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COLUMNFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COUNT", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COUNT" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": 
"tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DATASOURCEFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DATASOURCEFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DIMENSION", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DIMENSION" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:GROUPFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "GROUPFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:HIERARCHYFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "HIERARCHYFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:MEASURE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "MEASURE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SETFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SETFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SUM", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SUM" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:TagSheet3", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "TagSheet3" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:YEAR", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "YEAR" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/tableau/tableau_with_platform_instance_mces_golden.json b/metadata-ingestion/tests/integration/tableau/tableau_with_platform_instance_mces_golden.json index 6dba8062d3e4f..b0d9e06b4c492 100644 --- a/metadata-ingestion/tests/integration/tableau/tableau_with_platform_instance_mces_golden.json +++ b/metadata-ingestion/tests/integration/tableau/tableau_with_platform_instance_mces_golden.json @@ -42374,5 +42374,215 @@ "lastObserved": 1638860400000, "runId": "tableau-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:ATTRIBUTE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "ATTRIBUTE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:BINFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "BINFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:CALCULATEDFIELD", + "changeType": 
"UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "CALCULATEDFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COLUMNFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COLUMNFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:COUNT", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "COUNT" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DATASOURCEFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DATASOURCEFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:DIMENSION", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "DIMENSION" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:GROUPFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "GROUPFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:HIERARCHYFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "HIERARCHYFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:MEASURE", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "MEASURE" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SETFIELD", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SETFIELD" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:SUM", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "SUM" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:TagSheet3", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "TagSheet3" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:YEAR", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "YEAR" + } + }, + "systemMetadata": { + "lastObserved": 1638860400000, + "runId": "tableau-test" + } } ] \ No newline at end of file From 50796278f983a9d524366fb7761aad8142e861d2 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 24 May 2023 00:41:36 -0700 Subject: [PATCH 6/6] fix tests --- .../datahub/ingestion/source/dbt/dbt_common.py | 4 +--- .../powerbi/golden_test_endorsement.json | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py 
b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 9243a0f291858..76d1f5d5c414b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -871,9 +871,7 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ).workunit_processor, + self.stale_entity_removal_handler.workunit_processor, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json b/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json index e852ce5eca321..6b99ee55e02eb 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json @@ -1132,5 +1132,20 @@ "lastObserved": 1643871600000, "runId": "powerbi-test" } +}, +{ + "entityType": "tag", + "entityUrn": "urn:li:tag:Promoted", + "changeType": "UPSERT", + "aspectName": "tagKey", + "aspect": { + "json": { + "name": "Promoted" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } } ] \ No newline at end of file
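
Note on the resulting source shape: after this series, a converted source
overrides get_workunit_processors() and implements get_workunits_internal(),
with the processor chain applied over the raw stream for it. Below is a
minimal sketch of that shape, assuming the import paths used in this
repository at the time of the series; MySource is a hypothetical stand-in,
and create()/get_report()/configuration plumbing are omitted, so this is an
illustration of the pattern rather than a complete source:

    from typing import Iterable, List, Optional

    from datahub.ingestion.api.source import MetadataWorkUnitProcessor
    from datahub.ingestion.api.workunit import MetadataWorkUnit
    from datahub.ingestion.source.state.stale_entity_removal_handler import (
        StaleEntityRemovalHandler,
    )
    from datahub.ingestion.source.state.stateful_ingestion_base import (
        StatefulIngestionSourceBase,
    )

    class MySource(StatefulIngestionSourceBase):  # hypothetical source
        def get_workunit_processors(
            self,
        ) -> List[Optional[MetadataWorkUnitProcessor]]:
            # Extend, never replace, the base processors; stale-entity
            # removal is appended after them, as in the dbt and unity diffs.
            return [
                *super().get_workunit_processors(),
                StaleEntityRemovalHandler.create(
                    self, self.config, self.ctx
                ).workunit_processor,
            ]

        def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
            # Sources now emit raw work units only; the processors above
            # wrap this stream. Empty placeholder body for the sketch.
            yield from ()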
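The circular-import fix in patch 3/6 follows the standard TYPE_CHECKING
idiom: imports that would point back at source.py run only under type
checking, and the affected annotations become strings. A condensed sketch of
the idiom as applied to auto_workunit_reporter; the function body is
inferred from its docstring ("calls report.report_workunit() on each
workunit") and is not shown in the diff itself:

    from typing import TYPE_CHECKING, Iterable, TypeVar

    from datahub.ingestion.api.workunit import MetadataWorkUnit

    if TYPE_CHECKING:
        # Evaluated only by type checkers, never at runtime, so source.py
        # can import this module without re-creating the cycle.
        from datahub.ingestion.api.source import SourceReport

    T = TypeVar("T", bound=MetadataWorkUnit)

    def auto_workunit_reporter(
        report: "SourceReport", stream: Iterable[T]
    ) -> Iterable[T]:
        # The quoted annotation is resolved lazily, so SourceReport does
        # not need to be importable when this module loads.
        for wu in stream:
            report.report_workunit(wu)
            yield wu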