feat(ingest): start simplifying stateful ingestion state #6740

Merged · 1 commit · Dec 13, 2022
@@ -713,11 +713,9 @@ def get_last_checkpoint(
     if last_checkpoint is not None and is_conversion_required:
         # Map the BaseSQLAlchemyCheckpointState to DbtCheckpointState
         dbt_checkpoint_state: DbtCheckpointState = DbtCheckpointState()
-        dbt_checkpoint_state.encoded_node_urns = (
+        dbt_checkpoint_state.urns = (
             cast(BaseSQLAlchemyCheckpointState, last_checkpoint.state)
-        ).encoded_table_urns
-        # Old dbt source was not supporting the assertion
-        dbt_checkpoint_state.encoded_assertion_urns = []
+        ).urns
         last_checkpoint.state = dbt_checkpoint_state

     return last_checkpoint
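The conversion above collapses to a single field copy because, after this PR, both checkpoint classes expose the same urns list. A minimal sketch of the idea, with hypothetical values, assuming BaseSQLAlchemyCheckpointState was likewise migrated to the unified urns field (as the new .urns accessor implies):

# Hypothetical values; BaseSQLAlchemyCheckpointState's migration is assumed.
sql_state = BaseSQLAlchemyCheckpointState(
    urns=["urn:li:dataset:(urn:li:dataPlatform:postgres,db.schema.t1,PROD)"]
)
dbt_state = DbtCheckpointState(urns=sql_state.urns)
assert dbt_state.urns == sql_state.urns  # one field copy, no per-type mapping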
@@ -60,7 +60,7 @@
     LookerAPI,
     LookerAPIConfig,
 )
-from datahub.ingestion.source.state.looker_state import LookerCheckpointState
+from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StatefulStaleMetadataRemovalConfig,
@@ -234,7 +234,7 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext):
         self.stale_entity_removal_handler = StaleEntityRemovalHandler(
             source=self,
             config=self.source_config,
-            state_type_class=LookerCheckpointState,
+            state_type_class=GenericCheckpointState,
             pipeline_name=self.ctx.pipeline_name,
             run_id=self.ctx.run_id,
         )
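Since GenericCheckpointState accepts any entity type ("*" in get_supported_types, defined later in this diff), Looker no longer needs a dedicated checkpoint class. A rough sketch of what the handler's state can now hold, with hypothetical URNs:

# Hypothetical urns; in practice the handler calls add_checkpoint_urn itself.
state = GenericCheckpointState()
state.add_checkpoint_urn(type="dashboard", urn="urn:li:dashboard:(looker,dashboards.11)")
state.add_checkpoint_urn(type="chart", urn="urn:li:chart:(looker,22)")

previous = GenericCheckpointState(urns=["urn:li:dashboard:(looker,dashboards.99)"])
stale = list(previous.get_urns_not_in(type="dashboard", other_checkpoint_state=state))
# -> ["urn:li:dashboard:(looker,dashboards.99)"]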
@@ -47,7 +47,7 @@
     LookerAPIConfig,
     TransportOptionsConfig,
 )
-from datahub.ingestion.source.state.lookml_state import LookMLCheckpointState
+from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalHandler,
     StaleEntityRemovalSourceReport,
@@ -1089,7 +1089,7 @@ def __init__(self, config: LookMLSourceConfig, ctx: PipelineContext):
         self.stale_entity_removal_handler = StaleEntityRemovalHandler(
             source=self,
             config=self.source_config,
-            state_type_class=LookMLCheckpointState,
+            state_type_class=GenericCheckpointState,
             pipeline_name=self.ctx.pipeline_name,
             run_id=self.ctx.run_id,
         )
@@ -1,5 +1,6 @@
 import base64
 import bz2
+import contextlib
 import functools
 import json
 import logging
@@ -128,7 +129,9 @@ def create_from_checkpoint_aspect(
             )
         elif checkpoint_aspect.state.serde == "base85":
             state_obj = Checkpoint._from_base85_bytes(
-                checkpoint_aspect, functools.partial(bz2.decompress)
+                checkpoint_aspect,
+                functools.partial(bz2.decompress),
+                state_class,
             )
         elif checkpoint_aspect.state.serde == "base85-bz2-json":
             state_obj = Checkpoint._from_base85_json_bytes(
@@ -177,11 +180,18 @@ def _from_utf8_bytes(
     def _from_base85_bytes(
         checkpoint_aspect: DatahubIngestionCheckpointClass,
         decompressor: Callable[[bytes], bytes],
+        state_class: Type[StateType],
     ) -> StateType:
         state: StateType = pickle.loads(
             decompressor(base64.b85decode(checkpoint_aspect.state.payload))  # type: ignore
         )
+
+        with contextlib.suppress(Exception):
+            # When loading from pickle, the pydantic validators don't run.
+            # By re-serializing and re-parsing, we ensure that the state is valid.
+            # However, we also suppress any exceptions to make sure this doesn't blow up.
+            state = state_class.parse_obj(state.dict())
+
         # Because the base85 method is deprecated in favor of base85-bz2-json,
         # we will automatically switch the serde.
         state.serde = "base85-bz2-json"
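The re-parse matters because pickle.loads rebuilds the object directly, bypassing pydantic's validators (including root validators like the pydantic_state_migrator introduced below). A standalone toy sketch of the pattern, using a throwaway model rather than the real checkpoint classes:

import pickle

import pydantic


class ToyState(pydantic.BaseModel):
    urns: list = []

    @pydantic.validator("urns", pre=True)
    def _dedupe(cls, v):
        # Order-preserving dedup, so the effect of validation is observable.
        return list(dict.fromkeys(v))


raw = ToyState.construct(urns=["a", "a"])   # construct() skips validation, like pickle
restored = pickle.loads(pickle.dumps(raw))  # validators still have not run
revalidated = ToyState.parse_obj(restored.dict())  # re-serialize + re-parse runs them
assert revalidated.urns == ["a"]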
metadata-ingestion/src/datahub/ingestion/source/state/dbt_state.py (101 changes: 10 additions & 91 deletions)
@@ -1,102 +1,21 @@
-import logging
-from typing import Callable, Dict, Iterable, List
-
-import pydantic
-
-from datahub.emitter.mce_builder import make_assertion_urn
-from datahub.ingestion.source.state.stale_entity_removal_handler import (
-    StaleEntityCheckpointStateBase,
+from datahub.ingestion.source.state.entity_removal_state import (
+    GenericCheckpointState,
+    pydantic_state_migrator,
 )
-from datahub.utilities.checkpoint_state_util import CheckpointStateUtil
-from datahub.utilities.urns.urn import Urn
-
-logger = logging.getLogger(__name__)
 
 
-class DbtCheckpointState(StaleEntityCheckpointStateBase["DbtCheckpointState"]):
+class DbtCheckpointState(GenericCheckpointState):
     """
     Class for representing the checkpoint state for DBT sources.
     Stores all nodes and assertions being ingested and is used to remove any stale entities.
     """
 
-    encoded_node_urns: List[str] = pydantic.Field(default_factory=list)
-    encoded_assertion_urns: List[str] = pydantic.Field(default_factory=list)
-
-    @classmethod
-    def get_supported_types(cls) -> List[str]:
-        return ["assertion", "dataset"]
-
-    @staticmethod
-    def _get_assertion_lightweight_repr(assertion_urn: str) -> str:
-        """Reduces the amount of text in the URNs for smaller state footprint."""
-        urn = Urn.create_from_string(assertion_urn)
-        key = urn.get_entity_id_as_string()
-        assert key is not None
-        return key
-
-    def _add_assertion_urn(self, assertion_urn: str) -> None:
-        self.encoded_assertion_urns.append(
-            self._get_assertion_lightweight_repr(assertion_urn)
-        )
-
-    def _get_assertion_urns_not_in(
-        self, checkpoint: "DbtCheckpointState"
-    ) -> Iterable[str]:
-        """
-        Dbt assertion are mapped to DataHub assertion concept
-        """
-        difference = CheckpointStateUtil.get_encoded_urns_not_in(
-            self.encoded_assertion_urns, checkpoint.encoded_assertion_urns
-        )
-        for key in difference:
-            yield make_assertion_urn(key)
-
-    def _get_node_urns_not_in(self, checkpoint: "DbtCheckpointState") -> Iterable[str]:
-        """
-        Dbt node are mapped to DataHub dataset concept
-        """
-        yield from CheckpointStateUtil.get_dataset_urns_not_in(
-            self.encoded_node_urns, checkpoint.encoded_node_urns
-        )
-
-    def _add_node_urn(self, node_urn: str) -> None:
-        self.encoded_node_urns.append(
-            CheckpointStateUtil.get_dataset_lightweight_repr(node_urn)
-        )
-
-    def add_checkpoint_urn(self, type: str, urn: str) -> None:
-        supported_entities_add_handlers: Dict[str, Callable[[str], None]] = {
-            "dataset": self._add_node_urn,
-            "assertion": self._add_assertion_urn,
+    _migration = pydantic_state_migrator(
+        {
+            "encoded_node_urns": "dataset",
+            "encoded_assertion_urns": "assertion",
         }
-
-        if type not in supported_entities_add_handlers:
-            logger.error(f"Can not save Unknown entity {type} to checkpoint.")
-
-        supported_entities_add_handlers[type](urn)
-
-    def get_urns_not_in(
-        self, type: str, other_checkpoint_state: "DbtCheckpointState"
-    ) -> Iterable[str]:
-        assert type in self.get_supported_types()
-        if type == "dataset":
-            yield from self._get_node_urns_not_in(other_checkpoint_state)
-        elif type == "assertion":
-            yield from self._get_assertion_urns_not_in(other_checkpoint_state)
-
-    def get_percent_entities_changed(
-        self, old_checkpoint_state: "DbtCheckpointState"
-    ) -> float:
-        return StaleEntityCheckpointStateBase.compute_percent_entities_changed(
-            [
-                (self.encoded_node_urns, old_checkpoint_state.encoded_node_urns),
-                (
-                    self.encoded_assertion_urns,
-                    old_checkpoint_state.encoded_assertion_urns,
-                ),
-            ]
-        )
+    )
 
     def prepare_for_commit(self) -> None:
-        self.encoded_node_urns = list(set(self.encoded_node_urns))
-        self.encoded_assertion_urns = list(set(self.encoded_assertion_urns))
+        self.urns = list(set(self.urns))
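With the migrator in place, an old-format dbt checkpoint deserializes straight into the unified shape. A small sketch with a hypothetical payload (the legacy field stores lightweight-encoded assertion GUIDs, which the mapping turns back into full URNs):

# Hypothetical legacy payload; the root validator folds old fields into `urns`.
legacy = {"encoded_assertion_urns": ["abc-123"]}
state = DbtCheckpointState.parse_obj(legacy)
assert state.urns == ["urn:li:assertion:abc-123"]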
metadata-ingestion/src/datahub/ingestion/source/state/entity_removal_state.py (new file; path per the entity_removal_state imports above)
@@ -0,0 +1,76 @@
+from typing import Dict, Iterable, List, Type
+
+import pydantic
+
+from datahub.emitter.mce_builder import make_assertion_urn, make_container_urn
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+    StaleEntityCheckpointStateBase,
+)
+from datahub.utilities.checkpoint_state_util import CheckpointStateUtil
+from datahub.utilities.urns.urn import guess_entity_type
+
+
+class GenericCheckpointState(StaleEntityCheckpointStateBase["GenericCheckpointState"]):
+    urns: List[str] = pydantic.Field(default_factory=list)
+
+    @classmethod
+    def get_supported_types(cls) -> List[str]:
+        return ["*"]
+
+    def add_checkpoint_urn(self, type: str, urn: str) -> None:
+        # TODO: dedup
+        self.urns.append(urn)
+
+    def get_urns_not_in(
+        self, type: str, other_checkpoint_state: "GenericCheckpointState"
+    ) -> Iterable[str]:
+        diff = set(self.urns) - set(other_checkpoint_state.urns)
+
+        # To maintain backwards compatibility, we provide this filtering mechanism.
+        if type == "*":
+            yield from diff
+        else:
+            yield from (urn for urn in diff if guess_entity_type(urn) == type)
+
+    def get_percent_entities_changed(
+        self, old_checkpoint_state: "GenericCheckpointState"
+    ) -> float:
+        return StaleEntityCheckpointStateBase.compute_percent_entities_changed(
+            [(self.urns, old_checkpoint_state.urns)]
+        )
+
+
+def pydantic_state_migrator(mapping: Dict[str, str]) -> classmethod:
+    # mapping would be something like:
+    # {
+    #     'encoded_view_urns': 'dataset',
+    #     'encoded_container_urns': 'container',
+    # }
+
+    SUPPORTED_TYPES = [
+        "dataset",
+        "container",
+        "assertion",
+    ]
+    assert set(mapping.values()) <= set(SUPPORTED_TYPES)
+
+    def _validate_field_rename(cls: Type, values: dict) -> dict:
+        values.setdefault("urns", [])
+
+        for old_field, mapped_type in mapping.items():
+            if old_field not in values:
+                continue
+
+            value = values.pop(old_field)
+            if mapped_type == "dataset":
+                values["urns"] += CheckpointStateUtil.get_dataset_urns_not_in(value, [])
+            elif mapped_type == "container":
+                values["urns"] += [make_container_urn(guid) for guid in value]
+            elif mapped_type == "assertion":
+                values["urns"] += [make_assertion_urn(encoded) for encoded in value]
+            else:
+                raise ValueError(f"Unsupported type {mapped_type}")
+
+        return values
+
+    return pydantic.root_validator(pre=True, allow_reuse=True)(_validate_field_rename)
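To round out the new API, a hedged sketch of get_percent_entities_changed, which stateful ingestion can use as a safety check before deleting stale entities (URNs hypothetical; the exact formula lives in StaleEntityCheckpointStateBase.compute_percent_entities_changed):

# Hypothetical urns: quantify how much the entity set shifted between runs.
old = GenericCheckpointState(urns=["urn:li:tag:a", "urn:li:tag:b", "urn:li:tag:c"])
new = GenericCheckpointState(urns=["urn:li:tag:a"])
pct = new.get_percent_entities_changed(old_checkpoint_state=old)
# A high percentage can signal a partial run; callers may then skip
# stale-entity deletion rather than soft-delete most of the previous run.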

This file was deleted. (The LookerCheckpointState import removed above suggests it was looker_state.py.)

This file was deleted. (The LookMLCheckpointState import removed above suggests it was lookml_state.py.)
