Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Connected WorkspacePathOwnership with DirectFsAccessOwnership #3049

Merged
merged 3 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/databricks/labs/ucx/contexts/application.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever
from databricks.labs.ucx.recon.migration_recon import MigrationRecon
from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler, DirectFsAccessOwnership
from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver
from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler
from databricks.sdk import AccountClient, WorkspaceClient, core
Expand Down Expand Up @@ -269,6 +269,14 @@ def table_ownership(self) -> TableOwnership:
def workspace_path_ownership(self) -> WorkspacePathOwnership:
return WorkspacePathOwnership(self.administrator_locator, self.workspace_client)

@cached_property
def directfs_access_ownership(self) -> DirectFsAccessOwnership:
return DirectFsAccessOwnership(
self.administrator_locator,
self.workspace_path_ownership,
self.workspace_client,
)

@cached_property
def tables_migrator(self) -> TablesMigrator:
return TablesMigrator(
Expand Down
7 changes: 7 additions & 0 deletions src/databricks/labs/ucx/source_code/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,13 @@ def replace_assessment_infos(
assessment_end_timestamp=assessment_end or self.assessment_end_timestamp,
)

@property
def source_type(self) -> str | None:
if not self.source_lineage:
return None
last = self.source_lineage[-1]
return last.object_type


@dataclass
class UsedTable(SourceInfo):
Expand Down
40 changes: 34 additions & 6 deletions src/databricks/labs/ucx/source_code/directfs_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
import logging
from collections.abc import Sequence, Iterable

from databricks.labs.blueprint.paths import WorkspacePath
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.framework.crawlers import CrawlerBase
from databricks.labs.lsql.backends import SqlBackend
from databricks.sdk.errors import DatabricksError
from databricks.sdk.errors import DatabricksError, NotFound

from databricks.labs.ucx.framework.owners import Ownership
from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator, WorkspacePathOwnership
from databricks.labs.ucx.framework.utils import escape_sql_identifier
from databricks.labs.ucx.source_code.base import DirectFsAccess

Expand Down Expand Up @@ -62,10 +65,35 @@ class DirectFsAccessOwnership(Ownership[DirectFsAccess]):

- For queries, the creator of the query (if known).
- For jobs, the owner of the path for the notebook or source (if known).

At present this information is not gathered during the crawling process, so it can't be reported here.
"""

def _maybe_direct_owner(self, record: DirectFsAccess) -> None:
# TODO: Implement this once the creator/ownership information is exposed during crawling.
def __init__(
self,
administrator_locator: AdministratorLocator,
workspace_path_ownership: WorkspacePathOwnership,
workspace_client: WorkspaceClient,
) -> None:
super().__init__(administrator_locator)
self._workspace_path_ownership = workspace_path_ownership
self._workspace_client = workspace_client

def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None:
if record.source_type == 'QUERY':
return self._query_owner(record)
if record.source_type in {'NOTEBOOK', 'FILE'}:
return self._notebook_owner(record)
logger.warning(f"Unknown source type {record.source_type} for {record.source_id}")
return None

def _notebook_owner(self, record):
try:
workspace_path = WorkspacePath(self._workspace_client, record.source_id)
owner = self._workspace_path_ownership.owner_of(workspace_path)
return owner
except NotFound:
return None

def _query_owner(self, record):
query_id = record.source_lineage[-1].object_id.split('/')[1]
legacy_query = self._workspace_client.queries.get(query_id)
return legacy_query.owner_user_name
20 changes: 10 additions & 10 deletions tests/integration/source_code/test_directfs_access.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
import pytest

from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex
from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessOwnership
from databricks.labs.ucx.source_code.jobs import WorkflowLinter
from databricks.labs.ucx.source_code.queries import QueryLinter


@pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.")
def test_query_dfsa_ownership(runtime_ctx, make_query, make_dashboard, inventory_schema, sql_backend) -> None:
"""Verify the ownership of a direct-fs record for a query."""

Expand All @@ -29,13 +25,17 @@ def test_query_dfsa_ownership(runtime_ctx, make_query, make_dashboard, inventory
query_record = next(record for record in records if record.source_id == f"{dashboard.id}/{query.id}")

# Verify ownership can be made.
ownership = DirectFsAccessOwnership(runtime_ctx.administrator_locator)
assert ownership.owner_of(query_record) == runtime_ctx.workspace_client.current_user.me().user_name
owner = runtime_ctx.directfs_access_ownership.owner_of(query_record)
assert owner == runtime_ctx.workspace_client.current_user.me().user_name


@pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.")
def test_path_dfsa_ownership(
runtime_ctx, make_notebook, make_job, make_directory, inventory_schema, sql_backend
runtime_ctx,
make_notebook,
make_job,
make_directory,
inventory_schema,
sql_backend,
) -> None:
"""Verify the ownership of a direct-fs record for a notebook/source path associated with a job."""

Expand All @@ -61,5 +61,5 @@ def test_path_dfsa_ownership(
path_record = next(record for record in records if record.source_id == str(notebook))

# Verify ownership can be made.
ownership = DirectFsAccessOwnership(runtime_ctx.administrator_locator)
assert ownership.owner_of(path_record) == runtime_ctx.workspace_client.current_user.me().user_name
owner = runtime_ctx.directfs_access_ownership.owner_of(path_record)
assert owner == runtime_ctx.workspace_client.current_user.me().user_name
16 changes: 10 additions & 6 deletions tests/unit/source_code/test_directfs_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
from unittest.mock import create_autospec

from databricks.labs.lsql.backends import MockBackend
from databricks.sdk import WorkspaceClient

from databricks.labs.ucx.framework.owners import AdministratorLocator
from databricks.labs.ucx.framework.owners import AdministratorLocator, WorkspacePathOwnership
from databricks.labs.ucx.source_code.base import LineageAtom
from databricks.labs.ucx.source_code.directfs_access import (
DirectFsAccessCrawler,
Expand Down Expand Up @@ -38,12 +39,15 @@ def test_crawler_appends_dfsas() -> None:

def test_directfs_access_ownership() -> None:
"""Verify that the owner for a direct-fs access record is an administrator."""
ws = create_autospec(WorkspaceClient)
admin_locator = create_autospec(AdministratorLocator)
admin_locator.get_workspace_administrator.return_value = "an_admin"
workspace_path_ownership = create_autospec(WorkspacePathOwnership)
workspace_path_ownership.owner_of.return_value = "other_admin"

ownership = DirectFsAccessOwnership(admin_locator)
dfsa = DirectFsAccess()
ownership = DirectFsAccessOwnership(admin_locator, workspace_path_ownership, ws)
dfsa = DirectFsAccess(source_lineage=[LineageAtom(object_type="NOTEBOOK", object_id="/x/y/z")])
owner = ownership.owner_of(dfsa)

assert owner == "an_admin"
admin_locator.get_workspace_administrator.assert_called_once()
assert owner == "other_admin"
ws.queries.get.assert_not_called()
admin_locator.get_workspace_administrator.assert_not_called()