refactor(ingest): Call source_helpers via new WorkUnitProcessors in base Source #8101

Merged · 8 commits · May 24, 2023

Changes from 7 commits
54 changes: 51 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/api/source.py
@@ -3,7 +3,21 @@
from collections import defaultdict
from dataclasses import dataclass, field
from enum import Enum
-from typing import Dict, Generic, Iterable, Optional, Set, Type, TypeVar, Union, cast
+from functools import partial
+from typing import (
+    Callable,
+    Dict,
+    Generic,
+    Iterable,
+    List,
+    Optional,
+    Sequence,
+    Set,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+)

from pydantic import BaseModel

@@ -12,6 +26,11 @@
from datahub.ingestion.api.closeable import Closeable
from datahub.ingestion.api.common import PipelineContext, RecordEnvelope, WorkUnit
from datahub.ingestion.api.report import Report
+from datahub.ingestion.api.source_helpers import (
+    auto_materialize_referenced_tags,
+    auto_status_aspect,
+    auto_workunit_reporter,
+)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent
from datahub.utilities.lossy_collections import LossyDict, LossyList
@@ -118,6 +137,9 @@ class TestConnectionReport(Report):
WorkUnitType = TypeVar("WorkUnitType", bound=WorkUnit)
ExtractorConfig = TypeVar("ExtractorConfig", bound=ConfigModel)

+WorkUnitProcessor = Callable[[Iterable[WorkUnitType]], Iterable[WorkUnitType]]
+MetadataWorkUnitProcessor = WorkUnitProcessor[MetadataWorkUnit]
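For orientation: a WorkUnitProcessor is any callable that consumes a stream of workunits and returns a (possibly transformed) stream, usually written as a generator. A minimal sketch of a conforming processor; the function name and filtering rule are illustrative, not part of this PR:

```python
from typing import Iterable

from datahub.ingestion.api.workunit import MetadataWorkUnit


def drop_skipped_workunits(
    stream: Iterable[MetadataWorkUnit],
) -> Iterable[MetadataWorkUnit]:
    """Matches MetadataWorkUnitProcessor: an iterable of workunits in, one out."""
    for wu in stream:
        # A real processor could filter, transform, or emit extra workunits here.
        if not wu.id.startswith("skip-"):  # hypothetical filtering rule
            yield wu
```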


class Extractor(Generic[WorkUnitType, ExtractorConfig], Closeable, metaclass=ABCMeta):
ctx: PipelineContext
@@ -155,9 +177,35 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source":
# can't make this method abstract.
raise NotImplementedError('sources must implement "create"')

-    @abstractmethod
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        """A list of functions that transform the workunits produced by this source.
Collaborator: So the returned list can have None? Not clear to me why we need Optional[MetadataWorkUnitProcessor] as opposed to simply MetadataWorkUnitProcessor.

Collaborator (Author): Yeah, the idea is that we can do something like return [*super().get_workunit_processors(), other_workunit_processor if self.config.flag else None].
+        Run in order; the first processor in the list is applied first. Be careful with ordering when overriding.
+        """
+        return [
+            auto_status_aspect,
+            auto_materialize_referenced_tags,
+            partial(auto_workunit_reporter, self.get_report()),
+        ]
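The override pattern discussed in the thread above, as a hedged sketch: MySource, enable_extra, and my_processor are hypothetical names, and the other members a concrete Source must provide are omitted.

```python
from typing import Iterable, List, Optional

from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
from datahub.ingestion.api.workunit import MetadataWorkUnit


def my_processor(stream: Iterable[MetadataWorkUnit]) -> Iterable[MetadataWorkUnit]:
    yield from stream  # placeholder transformation


class MySource(Source):  # hypothetical; create(), get_report(), etc. omitted
    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
        # Base processors run first; a None entry is skipped entirely by
        # _apply_workunit_processors, which is what makes it conditional.
        return [
            *super().get_workunit_processors(),
            my_processor if self.config.enable_extra else None,  # hypothetical flag
        ]
```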

+    @staticmethod
+    def _apply_workunit_processors(
+        workunit_processors: Sequence[Optional[MetadataWorkUnitProcessor]],
+        stream: Iterable[MetadataWorkUnit],
+    ) -> Iterable[MetadataWorkUnit]:
+        for processor in workunit_processors:
+            if processor is not None:
+                stream = processor(stream)
+        return stream

    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        pass
+        return self._apply_workunit_processors(
+            self.get_workunit_processors(), self.get_workunits_internal()
+        )

+    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+        raise NotImplementedError(
+            "get_workunits_internal must be implemented if get_workunits is not overridden."
+        )
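With this contract, a typical source now implements only get_workunits_internal and inherits get_workunits, which chains the processors around it. A minimal sketch, assuming a hypothetical source class and dataset urn:

```python
from typing import Iterable

from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.ingestion.api.source import Source
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import StatusClass


class MyMinimalSource(Source):  # hypothetical; create(), get_report(), etc. omitted
    def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
        # Yield raw workunits only; status aspects, tag materialization, and
        # reporting are layered on by the inherited get_workunits().
        yield MetadataChangeProposalWrapper(
            entityUrn="urn:li:dataset:(urn:li:dataPlatform:demo,my.table,PROD)",
            aspect=StatusClass(removed=False),
        ).as_workunit()
```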

@abstractmethod
def get_report(self) -> SourceReport:
24 changes: 10 additions & 14 deletions metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
@@ -1,12 +1,7 @@
-from typing import Callable, Iterable, Optional, Set, TypeVar, Union
+from typing import TYPE_CHECKING, Callable, Iterable, Optional, Set, TypeVar, Union

from datahub.emitter.mcp import MetadataChangeProposalWrapper
-from datahub.ingestion.api.common import WorkUnit
-from datahub.ingestion.api.source import SourceReport
from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source.state.stale_entity_removal_handler import (
-    StaleEntityRemovalHandler,
-)
from datahub.metadata.schema_classes import (
MetadataChangeEventClass,
MetadataChangeProposalClass,
@@ -17,6 +12,12 @@
from datahub.utilities.urns.urn import guess_entity_type
from datahub.utilities.urns.urn_iter import list_urns

+if TYPE_CHECKING:
+    from datahub.ingestion.api.source import SourceReport
+    from datahub.ingestion.source.state.stale_entity_removal_handler import (
+        StaleEntityRemovalHandler,
+    )


def auto_workunit(
stream: Iterable[Union[MetadataChangeEventClass, MetadataChangeProposalWrapper]]
@@ -78,7 +79,7 @@ def _default_entity_type_fn(wu: MetadataWorkUnit) -> Optional[str]:


def auto_stale_entity_removal(
-    stale_entity_removal_handler: StaleEntityRemovalHandler,
+    stale_entity_removal_handler: "StaleEntityRemovalHandler",
stream: Iterable[MetadataWorkUnit],
entity_type_fn: Callable[
[MetadataWorkUnit], Optional[str]
@@ -104,10 +105,10 @@ def auto_stale_entity_removal(
yield from stale_entity_removal_handler.gen_removed_entity_workunits()


T = TypeVar("T", bound=WorkUnit)
T = TypeVar("T", bound=MetadataWorkUnit)


-def auto_workunit_reporter(report: SourceReport, stream: Iterable[T]) -> Iterable[T]:
+def auto_workunit_reporter(report: "SourceReport", stream: Iterable[T]) -> Iterable[T]:
"""
Calls report.report_workunit() on each workunit.
"""
@@ -119,14 +120,9 @@ def auto_workunit_reporter(report: SourceReport, stream: Iterable[T]) -> Iterable[T]:

def auto_materialize_referenced_tags(
stream: Iterable[MetadataWorkUnit],
-    active: bool = True,
Collaborator (Author): Now handled by passing in None for the workunit processor, although I don't think this was ever passed as False.
) -> Iterable[MetadataWorkUnit]:
"""For all references to tags, emit a tag key aspect to ensure that the tag exists in our backend."""

-    if not active:
-        yield from stream
-        return

referenced_tags = set()
tags_with_aspects = set()
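The rest of the body is truncated in this view; below is a condensed sketch of the pattern the docstring describes, reusing this module's imports (list_urns, guess_entity_type, MetadataChangeProposalWrapper, TagKeyClass) and simplified relative to the real implementation:

```python
def _materialize_tags_sketch(
    stream: Iterable[MetadataWorkUnit],
) -> Iterable[MetadataWorkUnit]:
    referenced_tags: Set[str] = set()
    for wu in stream:
        # Collect every tag urn referenced anywhere in the workunit's metadata.
        referenced_tags.update(
            urn for urn in list_urns(wu.metadata) if guess_entity_type(urn) == "tag"
        )
        yield wu
    # Once the stream is exhausted, emit a TagKey aspect for each referenced tag
    # so the tag entity is guaranteed to exist in the backend.
    for tag_urn in referenced_tags:
        yield MetadataChangeProposalWrapper(
            entityUrn=tag_urn,
            aspect=TagKeyClass(name=tag_urn[len("urn:li:tag:"):]),
        ).as_workunit()
```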

32 changes: 8 additions & 24 deletions metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -48,11 +48,7 @@
platform_name,
support_status,
)
-from datahub.ingestion.api.source_helpers import (
-    auto_stale_entity_removal,
-    auto_status_aspect,
-    auto_workunit_reporter,
-)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.aws import s3_util
from datahub.ingestion.source.aws.aws_common import AwsSourceConfig
@@ -62,9 +58,6 @@
DatasetSubTypes,
)
from datahub.ingestion.source.glue_profiling_config import GlueProfilingConfig
-from datahub.ingestion.source.state.sql_common_state import (
-    BaseSQLAlchemyCheckpointState,
-)
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
StaleEntityRemovalSourceReport,
@@ -273,15 +266,6 @@ def __init__(self, config: GlueSourceConfig, ctx: PipelineContext):
self.extract_transforms = config.extract_transforms
self.env = config.env

-        # Create and register the stateful ingestion use-case handlers.
-        self.stale_entity_removal_handler = StaleEntityRemovalHandler(
-            source=self,
-            config=self.source_config,
-            state_type_class=BaseSQLAlchemyCheckpointState,
-            pipeline_name=self.ctx.pipeline_name,
-            run_id=self.ctx.run_id,
-        )

def get_glue_arn(
self, account_id: str, database: str, table: Optional[str] = None
) -> str:
@@ -919,13 +903,13 @@ def _get_domain_wu(
domain_urn=domain_urn,
)

-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        return auto_stale_entity_removal(
-            self.stale_entity_removal_handler,
-            auto_workunit_reporter(
-                self.report, auto_status_aspect(self.get_workunits_internal())
-            ),
-        )
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.source_config, self.ctx
+            ).workunit_processor,
+        ]

def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
database_seen = set()
metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py
@@ -30,16 +30,11 @@
)
from datahub.ingestion.api.source import (
CapabilityReport,
+    MetadataWorkUnitProcessor,
SourceCapability,
TestableSource,
TestConnectionReport,
)
-from datahub.ingestion.api.source_helpers import (
-    auto_materialize_referenced_tags,
-    auto_stale_entity_removal,
-    auto_status_aspect,
-    auto_workunit_reporter,
-)
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.bigquery_v2.bigquery_audit import (
BigqueryTableIdentifier,
@@ -80,9 +75,6 @@
from datahub.ingestion.source.state.redundant_run_skip_handler import (
RedundantRunSkipHandler,
)
-from datahub.ingestion.source.state.sql_common_state import (
-    BaseSQLAlchemyCheckpointState,
-)
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
)
@@ -228,15 +220,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config):
self.lineage_extractor = BigqueryLineageExtractor(config, self.report)
self.usage_extractor = BigQueryUsageExtractor(config, self.report)

-        # Create and register the stateful ingestion use-case handler.
-        self.stale_entity_removal_handler = StaleEntityRemovalHandler(
-            source=self,
-            config=self.config,
-            state_type_class=BaseSQLAlchemyCheckpointState,
-            pipeline_name=self.ctx.pipeline_name,
-            run_id=self.ctx.run_id,
-        )

self.domain_registry: Optional[DomainRegistry] = None
if self.config.domain:
self.domain_registry = DomainRegistry(
@@ -491,6 +474,14 @@ def gen_dataset_containers(
tags=tags_joined,
)

+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            StaleEntityRemovalHandler.create(
+                self, self.config, self.ctx
+            ).workunit_processor,
+        ]
Collaborator: Stale entity removal would emit some workunits which won't be reported, I believe, as the workunit_processor for stale entity removal comes after auto_workunit_reporter. The same was the case earlier, so I'm guessing that's okay.

Collaborator (Author): Yeah, I was trying to match existing behavior. If we want it the other way we can always make the change.
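To make the ordering point concrete, a toy demonstration (plain strings stand in for workunits; none of this is DataHub code) of why units emitted by a later processor bypass an earlier reporter:

```python
from typing import Iterable


def reporter(stream: Iterable[str]) -> Iterable[str]:
    for wu in stream:
        print(f"reported: {wu}")  # stands in for report.report_workunit(wu)
        yield wu


def stale_removal(stream: Iterable[str]) -> Iterable[str]:
    yield from stream
    yield "soft-delete-wu"  # extra workunit emitted after the input is drained


stream: Iterable[str] = iter(["wu-1", "wu-2"])
for processor in [reporter, stale_removal]:  # list order = application order
    stream = processor(stream)

print(list(stream))
# Prints "reported: wu-1" and "reported: wu-2", then
# ['wu-1', 'wu-2', 'soft-delete-wu']: the soft-delete workunit reaches the
# sink but is never reported, because stale_removal wraps the reporter.
```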


def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
conn: bigquery.Client = get_bigquery_client(self.config)
self.add_config_to_report()
@@ -514,17 +505,6 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
self.report.set_ingestion_stage(project.id, "Lineage Extraction")
yield from self.generate_lineage(project.id)

-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        return auto_materialize_referenced_tags(
-            auto_stale_entity_removal(
-                self.stale_entity_removal_handler,
-                auto_workunit_reporter(
-                    self.report,
-                    auto_status_aspect(self.get_workunits_internal()),
-                ),
-            )
-        )

def _should_ingest_usage(self) -> bool:
if not self.config.include_usage_statistics:
return False
30 changes: 8 additions & 22 deletions metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py
@@ -31,12 +31,7 @@
platform_name,
support_status,
)
-from datahub.ingestion.api.source_helpers import (
-    auto_materialize_referenced_tags,
-    auto_stale_entity_removal,
-    auto_status_aspect,
-    auto_workunit_reporter,
-)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.ingestion.source.common.subtypes import DatasetSubTypes
from datahub.ingestion.source.sql.sql_types import (
@@ -50,7 +45,6 @@
resolve_trino_modified_type,
resolve_vertica_modified_type,
)
-from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalHandler,
StaleEntityRemovalSourceReport,
@@ -688,12 +682,8 @@ def __init__(self, config: DBTCommonConfig, ctx: PipelineContext, platform: str)
self.config.owner_extraction_pattern
)
# Create and register the stateful ingestion use-case handler.
-        self.stale_entity_removal_handler = StaleEntityRemovalHandler(
-            source=self,
-            config=self.config,
-            state_type_class=GenericCheckpointState,
-            pipeline_name=self.ctx.pipeline_name,
-            run_id=self.ctx.run_id,
+        self.stale_entity_removal_handler = StaleEntityRemovalHandler.create(
+            self, self.config, ctx
        )

Collaborator (Author): This gets used to call self.stale_entity_removal_handler.add_urn_to_skip(node_datahub_urn) at some point.

def create_test_entity_mcps(
@@ -878,15 +868,11 @@ def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]:
# return dbt nodes + global custom properties
raise NotImplementedError()

-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        return auto_materialize_referenced_tags(
-            auto_stale_entity_removal(
-                self.stale_entity_removal_handler,
-                auto_workunit_reporter(
-                    self.report, auto_status_aspect(self.get_workunits_internal())
-                ),
-            )
-        )
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        return [
+            *super().get_workunit_processors(),
+            self.stale_entity_removal_handler.workunit_processor,
+        ]

def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
if self.config.write_semantics == "PATCH" and not self.ctx.graph:
9 changes: 6 additions & 3 deletions metadata-ingestion/src/datahub/ingestion/source/file.py
@@ -6,8 +6,9 @@
from collections import defaultdict
from dataclasses import dataclass, field
from enum import auto
+from functools import partial
from io import BufferedReader
-from typing import Any, Dict, Iterable, Iterator, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Tuple, Union
from urllib import parse

import ijson
@@ -28,6 +29,7 @@
)
from datahub.ingestion.api.source import (
CapabilityReport,
+    MetadataWorkUnitProcessor,
SourceReport,
TestableSource,
TestConnectionReport,
@@ -205,8 +207,9 @@ def get_filenames(self) -> Iterable[str]:
self.report.total_num_files = 1
return [str(self.config.path)]

-    def get_workunits(self) -> Iterable[MetadataWorkUnit]:
-        return auto_workunit_reporter(self.report, self.get_workunits_internal())
+    def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+        # No super() call, as we don't want helpers that create / remove workunits
+        return [partial(auto_workunit_reporter, self.report)]
Collaborator: Add a comment that not calling super() is intentional here.

def get_workunits_internal(
self,
Expand Down