-
Notifications
You must be signed in to change notification settings - Fork 80
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added collection of used tables from Python notebooks and files and SQL queries #2772
Changes from 10 commits
9ab6ace
ced75c3
b3f4811
0a2af4f
f74bf08
5cbeae1
5a6552d
e5856fa
a4fb53e
89fadd8
fef2f68
1865680
1541c75
198bc9f
a0567b4
4541dd1
118b094
9cc3677
6421acc
be59a8f
9b35060
5b9b763
21b7942
c8abbb9
bf8747b
e0509b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
-- Union of the table references stored in the two inventory tables
-- `table_infos_in_paths` and `table_infos_in_queries`.
-- Both branches project the same eight columns in the same order, and
-- UNION ALL keeps duplicate rows (no de-duplication is performed).
SELECT
    catalog_name,
    schema_name,
    table_name,
    source_id,
    source_timestamp,
    source_lineage,
    assessment_start_timestamp,
    assessment_end_timestamp
FROM $inventory.table_infos_in_paths
UNION ALL
SELECT
    catalog_name,
    schema_name,
    table_name,
    source_id,
    source_timestamp,
    source_lineage,
    assessment_start_timestamp,
    assessment_end_timestamp
FROM $inventory.table_infos_in_queries
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,8 +6,10 @@ | |
import logging | ||
from abc import abstractmethod, ABC | ||
from collections.abc import Iterable | ||
from dataclasses import dataclass | ||
from dataclasses import dataclass, field | ||
from datetime import datetime | ||
from pathlib import Path | ||
from typing import Self, Any | ||
|
||
from astroid import AstroidSyntaxError, NodeNG # type: ignore | ||
from sqlglot import Expression, parse as parse_sql, ParseError as SqlParseError | ||
|
@@ -174,6 +176,145 @@ def name(self) -> str: ... | |
def apply(self, code: str) -> str: ... | ||
|
||
|
||
@dataclass | ||
class LineageAtom: | ||
|
||
object_type: str | ||
object_id: str | ||
other: dict[str, str] | None = None | ||
|
||
|
||
@dataclass | ||
class SourceInfo: | ||
|
||
@classmethod | ||
def from_dict(cls, data: dict[str, Any]) -> Self: | ||
source_lineage = data.get("source_lineage", None) | ||
if isinstance(source_lineage, list) and len(source_lineage) > 0 and isinstance(source_lineage[0], dict): | ||
lineage_atoms = [LineageAtom(**lineage) for lineage in source_lineage] | ||
data["source_lineage"] = lineage_atoms | ||
return cls(**data) | ||
|
||
UNKNOWN = "unknown" | ||
|
||
source_id: str = UNKNOWN | ||
source_timestamp: datetime = datetime.fromtimestamp(0) | ||
source_lineage: list[LineageAtom] = field(default_factory=list) | ||
assessment_start_timestamp: datetime = datetime.fromtimestamp(0) | ||
assessment_end_timestamp: datetime = datetime.fromtimestamp(0) | ||
|
||
def replace_source( | ||
self, | ||
source_id: str | None = None, | ||
source_lineage: list[LineageAtom] | None = None, | ||
source_timestamp: datetime | None = None, | ||
): | ||
return dataclasses.replace( | ||
self, | ||
source_id=source_id or self.source_id, | ||
source_timestamp=source_timestamp or self.source_timestamp, | ||
source_lineage=source_lineage or self.source_lineage, | ||
) | ||
|
||
def replace_assessment_infos( | ||
self, assessment_start: datetime | None = None, assessment_end: datetime | None = None | ||
): | ||
return dataclasses.replace( | ||
self, | ||
assessment_start_timestamp=assessment_start or self.assessment_start_timestamp, | ||
assessment_end_timestamp=assessment_end or self.assessment_end_timestamp, | ||
) | ||
|
||
|
||
@dataclass
class TableInfo(SourceInfo):
    """A table reference (catalog, schema, table) together with its source provenance."""

    @classmethod
    def parse(cls, value: str, default_schema: str) -> Self:
        """Build an instance from a dot-separated table reference.

        * ``catalog.schema.table`` - all three parts are taken from ``value``.
        * ``schema.table`` - the catalog defaults to ``"hive_metastore"``.
        * ``table`` - the schema additionally defaults to ``default_schema``.

        Segments after the third one are ignored. The instance is created
        through ``cls`` so that subclasses get an instance of their own type.
        """
        parts = value.split(".")
        catalog_name = parts.pop(0) if len(parts) >= 3 else "hive_metastore"
        schema_name = parts.pop(0) if len(parts) >= 2 else default_schema
        return cls(catalog_name=catalog_name, schema_name=schema_name, table_name=parts[0])

    catalog_name: str = SourceInfo.UNKNOWN
    schema_name: str = SourceInfo.UNKNOWN
    table_name: str = SourceInfo.UNKNOWN
Review thread — Reviewer: "can we also add …" (the rest of the request was lost when the page was extracted). Author: "Done. Populated for sql. For python calls, I suggest doing in a separate PR since it's a lot of work." Reviewer: "separate PR works" |
||
|
||
|
||
class TableCollector(ABC):
    """Interface for objects that collect table references from source code."""

    @abstractmethod
    def collect_tables(self, source_code: str) -> Iterable[TableInfo]:
        """Return the tables found in ``source_code``."""
|
||
|
||
@dataclass
class TableInfoNode:
    """Pairs a collected table with the astroid node associated with it."""

    table: TableInfo
    node: NodeNG
|
||
|
||
class TablePyCollector(TableCollector, ABC):
    """Base class for table collectors that analyse Python source code."""

    def collect_tables(self, source_code: str) -> Iterable[TableInfo]:
        """Parse ``source_code`` into a tree and yield every table collected from it."""
        tree = Tree.normalize_and_parse(source_code)
        for table_node in self.collect_tables_from_tree(tree):
            yield table_node.table

    # NOTE(review): a reviewer pointed out that this method is not used in this
    # abstract class; it is kept here so the declared interface stays unchanged.
    @abstractmethod
    def collect_tables_from_source(self, source_code: str, inherited_tree: Tree | None) -> Iterable[TableInfoNode]:
        """Return the table/node pairs found in ``source_code``."""

    @abstractmethod
    def collect_tables_from_tree(self, tree: Tree) -> Iterable[TableInfoNode]:
        """Return the table/node pairs found in an already parsed ``tree``."""
|
||
|
||
class TableSqlCollector(TableCollector, ABC):
    """Base class for SQL table collectors; adds no members to ``TableCollector``."""
|
||
|
||
@dataclass
class DirectFsAccess(SourceInfo):
    """A record describing a Direct File System Access"""

    # the accessed path
    path: str = SourceInfo.UNKNOWN
    # read/write flags, both off by default
    is_read: bool = False
    is_write: bool = False
|
||
|
||
@dataclass
class DirectFsAccessNode:
    """Pairs a direct file system access record with the astroid node associated with it."""

    dfsa: DirectFsAccess
    node: NodeNG
|
||
|
||
class DfsaCollector(ABC):
    """Interface for objects that collect direct file system accesses from source code."""

    @abstractmethod
    def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
        """Return the direct file system accesses found in ``source_code``."""
|
||
|
||
class DfsaPyCollector(DfsaCollector, ABC):
    """Base class for direct file system access collectors that analyse Python source code."""

    def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
        """Parse ``source_code`` into a tree and yield every access collected from it."""
        tree = Tree.normalize_and_parse(source_code)
        for dfsa_node in self.collect_dfsas_from_tree(tree):
            yield dfsa_node.dfsa

    @abstractmethod
    def collect_dfsas_from_source(
        self, source_code: str, inherited_tree: Tree | None
    ) -> Iterable[DirectFsAccessNode]:
        """Return the access/node pairs found in ``source_code``."""

    @abstractmethod
    def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]:
        """Return the access/node pairs found in an already parsed ``tree``."""
|
||
|
||
class DfsaSqlCollector(DfsaCollector, ABC):
    """Base class for SQL direct file system access collectors; adds no members to ``DfsaCollector``."""
|
||
|
||
# The default schema to use when the schema is not specified in a table reference | ||
# See: https://spark.apache.org/docs/3.0.0-preview/sql-ref-syntax-qry-select-usedb.html | ||
DEFAULT_CATALOG = 'hive_metastore' | ||
|
@@ -221,20 +362,42 @@ def parse_security_mode(mode_str: str | None) -> compute.DataSecurityMode | None | |
return None | ||
|
||
|
||
class SqlSequentialLinter(SqlLinter, DfsaCollector, TableCollector):
    """Applies several SQL linters and collectors one after the other.

    Results are produced in the order of the lists handed to the constructor.
    """

    def __init__(
        self,
        linters: list[SqlLinter],
        dfsa_collectors: list[DfsaSqlCollector],
        table_collectors: list[TableSqlCollector],
    ):
        """Store the linters and collectors that the other methods delegate to."""
        self._linters = linters
        self._dfsa_collectors = dfsa_collectors
        self._table_collectors = table_collectors

    def lint_expression(self, expression: Expression) -> Iterable[Advice]:
        """Yield the advices every linter reports for ``expression``."""
        for linter in self._linters:
            yield from linter.lint_expression(expression)

    def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
        """Yield the direct file system accesses every DFSA collector finds in ``source_code``."""
        for collector in self._dfsa_collectors:
            yield from collector.collect_dfsas(source_code)

    def collect_tables(self, source_code: str) -> Iterable[TableInfo]:
        """Yield the tables every table collector finds in ``source_code``."""
        for collector in self._table_collectors:
            yield from collector.collect_tables(source_code)
|
||
|
||
class PythonSequentialLinter(Linter): | ||
class PythonSequentialLinter(Linter, DfsaCollector, TableCollector): | ||
|
||
def __init__(
    self,
    linters: list[PythonLinter],
    dfsa_collectors: list[DfsaPyCollector],
    table_collectors: list[TablePyCollector],
):
    """Store the linters and collectors to delegate to; no tree exists yet."""
    self._linters = linters
    self._dfsa_collectors = dfsa_collectors
    self._table_collectors = table_collectors
    # starts as None until a tree is assigned to it
    self._tree: Tree | None = None
|
||
def lint(self, code: str) -> Iterable[Advice]: | ||
|
@@ -271,6 +434,30 @@ def process_child_cell(self, code: str): | |
# error already reported when linting enclosing notebook | ||
logger.warning(f"Failed to parse Python cell: {code}", exc_info=e) | ||
|
||
def collect_dfsas(self, source_code: str) -> Iterable[DirectFsAccess]:
    """Yield the direct file system accesses found in ``source_code``.

    The code is handed to ``self._parse_and_append`` and the resulting tree to
    ``collect_dfsas_from_tree``. An ``AstroidSyntaxError`` raised at any point
    is logged as a warning and ends the iteration instead of propagating.
    """
    try:
        tree = self._parse_and_append(source_code)
        for dfsa_node in self.collect_dfsas_from_tree(tree):
            yield dfsa_node.dfsa
    except AstroidSyntaxError as e:
        logger.warning('syntax-error', exc_info=e)
|
||
def collect_dfsas_from_tree(self, tree: Tree) -> Iterable[DirectFsAccessNode]:
    """Yield the access/node pairs each DFSA collector finds in ``tree``, in collector order."""
    for collector in self._dfsa_collectors:
        yield from collector.collect_dfsas_from_tree(tree)
|
||
def collect_tables(self, source_code: str) -> Iterable[TableInfo]:
    """Yield the tables found in ``source_code``.

    The code is handed to ``self._parse_and_append`` and the resulting tree to
    ``collect_tables_from_tree``. An ``AstroidSyntaxError`` raised at any point
    is logged as a warning and ends the iteration instead of propagating.
    """
    try:
        tree = self._parse_and_append(source_code)
        for table_node in self.collect_tables_from_tree(tree):
            yield table_node.table
    except AstroidSyntaxError as e:
        logger.warning('syntax-error', exc_info=e)
|
||
def collect_tables_from_tree(self, tree: Tree) -> Iterable[TableInfoNode]:
    """Yield the table/node pairs each table collector finds in ``tree``, in collector order."""
    for collector in self._table_collectors:
        yield from collector.collect_tables_from_tree(tree)
|
||
def _make_tree(self) -> Tree: | ||
if self._tree is None: | ||
self._tree = Tree.new_module() | ||
|
Review thread — Reviewer: "we already have table info - https://databricks-sdk-py.readthedocs.io/en/latest/dbdataclasses/catalog.html#databricks.sdk.service.catalog.TableInfo"
Author: "done"