From 0b49726a2feb8b38d6cd712dc281dbf0c05bc10d Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Wed, 23 Oct 2024 17:19:36 +0200 Subject: [PATCH 1/3] Connected `WorkspacePathOwnership` with `DirectFsAccessOwnership` --- .../labs/ucx/contexts/application.py | 6 +++- .../labs/ucx/source_code/directfs_access.py | 32 +++++++++++++++---- .../source_code/test_directfs_access.py | 4 +-- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index 24132afd1f..aea97f3566 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -17,7 +17,7 @@ from databricks.labs.ucx.recon.metadata_retriever import DatabricksTableMetadataRetriever from databricks.labs.ucx.recon.migration_recon import MigrationRecon from databricks.labs.ucx.recon.schema_comparator import StandardSchemaComparator -from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler +from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessCrawler, DirectFsAccessOwnership from databricks.labs.ucx.source_code.python_libraries import PythonLibraryResolver from databricks.labs.ucx.source_code.used_table import UsedTablesCrawler from databricks.sdk import AccountClient, WorkspaceClient, core @@ -269,6 +269,10 @@ def table_ownership(self) -> TableOwnership: def workspace_path_ownership(self) -> WorkspacePathOwnership: return WorkspacePathOwnership(self.administrator_locator, self.workspace_client) + @cached_property + def directfs_access_ownership(self) -> DirectFsAccessOwnership: + return DirectFsAccessOwnership(self.administrator_locator, self.workspace_path_ownership, self.workspace_client,) + @cached_property def tables_migrator(self) -> TablesMigrator: return TablesMigrator( diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index c7e16cad2f..acdd773097 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -3,11 +3,14 @@ import logging from collections.abc import Sequence, Iterable +from databricks.labs.blueprint.paths import WorkspacePath +from databricks.sdk import WorkspaceClient + from databricks.labs.ucx.framework.crawlers import CrawlerBase from databricks.labs.lsql.backends import SqlBackend -from databricks.sdk.errors import DatabricksError +from databricks.sdk.errors import DatabricksError, NotFound -from databricks.labs.ucx.framework.owners import Ownership +from databricks.labs.ucx.framework.owners import Ownership, AdministratorLocator, WorkspacePathOwnership from databricks.labs.ucx.framework.utils import escape_sql_identifier from databricks.labs.ucx.source_code.base import DirectFsAccess @@ -62,10 +65,25 @@ class DirectFsAccessOwnership(Ownership[DirectFsAccess]): - For queries, the creator of the query (if known). - For jobs, the owner of the path for the notebook or source (if known). - - At present this information is not gathered during the crawling process, so it can't be reported here. """ - def _maybe_direct_owner(self, record: DirectFsAccess) -> None: - # TODO: Implement this once the creator/ownership information is exposed during crawling. - return None + def __init__( + self, + administrator_locator: AdministratorLocator, + workspace_path_ownership: WorkspacePathOwnership, + workspace_client: WorkspaceClient, + ) -> None: + super().__init__(administrator_locator) + self._workspace_path_ownership = workspace_path_ownership + self._workspace_client = workspace_client + + def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None: + if record.source_lineage and record.source_lineage[-1].object_type == 'QUERY': + query_id = record.source_lineage[-1].object_id.split('/')[1] + legacy_query = self._workspace_client.queries.get(query_id) + return legacy_query.owner_user_name + try: + workspace_path = WorkspacePath(self._workspace_client, record.path) + return self._workspace_path_ownership.owner_of(workspace_path) + except NotFound: + return None diff --git a/tests/integration/source_code/test_directfs_access.py b/tests/integration/source_code/test_directfs_access.py index a462040614..329fba192d 100644 --- a/tests/integration/source_code/test_directfs_access.py +++ b/tests/integration/source_code/test_directfs_access.py @@ -29,8 +29,8 @@ def test_query_dfsa_ownership(runtime_ctx, make_query, make_dashboard, inventory query_record = next(record for record in records if record.source_id == f"{dashboard.id}/{query.id}") # Verify ownership can be made. - ownership = DirectFsAccessOwnership(runtime_ctx.administrator_locator) - assert ownership.owner_of(query_record) == runtime_ctx.workspace_client.current_user.me().user_name + owner = runtime_ctx.directfs_access_ownership.owner_of(query_record) + assert owner == runtime_ctx.workspace_client.current_user.me().user_name @pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.") From 60db1aa2e5ce97a49106b807633b82980fbca6e2 Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Thu, 24 Oct 2024 12:56:14 +0200 Subject: [PATCH 2/3] ... --- .../labs/ucx/contexts/application.py | 6 ++++- src/databricks/labs/ucx/source_code/base.py | 7 ++++++ .../labs/ucx/source_code/directfs_access.py | 22 ++++++++++++++----- .../source_code/test_directfs_access.py | 16 +++++++------- .../unit/source_code/test_directfs_access.py | 14 +++++++----- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/src/databricks/labs/ucx/contexts/application.py b/src/databricks/labs/ucx/contexts/application.py index aea97f3566..300051c7e8 100644 --- a/src/databricks/labs/ucx/contexts/application.py +++ b/src/databricks/labs/ucx/contexts/application.py @@ -271,7 +271,11 @@ def workspace_path_ownership(self) -> WorkspacePathOwnership: @cached_property def directfs_access_ownership(self) -> DirectFsAccessOwnership: - return DirectFsAccessOwnership(self.administrator_locator, self.workspace_path_ownership, self.workspace_client,) + return DirectFsAccessOwnership( + self.administrator_locator, + self.workspace_path_ownership, + self.workspace_client, + ) @cached_property def tables_migrator(self) -> TablesMigrator: diff --git a/src/databricks/labs/ucx/source_code/base.py b/src/databricks/labs/ucx/source_code/base.py index 85f5b598f6..6d46e9d600 100644 --- a/src/databricks/labs/ucx/source_code/base.py +++ b/src/databricks/labs/ucx/source_code/base.py @@ -224,6 +224,13 @@ def replace_assessment_infos( assessment_end_timestamp=assessment_end or self.assessment_end_timestamp, ) + @property + def source_type(self) -> str | None: + if not self.source_lineage: + return None + last = self.source_lineage[-1] + return last.object_type + @dataclass class UsedTable(SourceInfo): diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index acdd773097..e55906758f 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -78,12 +78,22 @@ def __init__( self._workspace_client = workspace_client def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None: - if record.source_lineage and record.source_lineage[-1].object_type == 'QUERY': - query_id = record.source_lineage[-1].object_id.split('/')[1] - legacy_query = self._workspace_client.queries.get(query_id) - return legacy_query.owner_user_name + if record.source_type == 'QUERY': + return self._query_owner(record) + if record.source_type == 'NOTEBOOK': + return self._notebook_owner(record) + logger.warning(f"Unknown source type {record.source_type} for {record.source_id}") + return None + + def _notebook_owner(self, record): try: - workspace_path = WorkspacePath(self._workspace_client, record.path) - return self._workspace_path_ownership.owner_of(workspace_path) + workspace_path = WorkspacePath(self._workspace_client, record.source_id) + owner = self._workspace_path_ownership.owner_of(workspace_path) + return owner except NotFound: return None + + def _query_owner(self, record): + query_id = record.source_lineage[-1].object_id.split('/')[1] + legacy_query = self._workspace_client.queries.get(query_id) + return legacy_query.owner_user_name diff --git a/tests/integration/source_code/test_directfs_access.py b/tests/integration/source_code/test_directfs_access.py index 329fba192d..330b60abdd 100644 --- a/tests/integration/source_code/test_directfs_access.py +++ b/tests/integration/source_code/test_directfs_access.py @@ -1,12 +1,8 @@ -import pytest - from databricks.labs.ucx.hive_metastore.table_migration_status import TableMigrationIndex -from databricks.labs.ucx.source_code.directfs_access import DirectFsAccessOwnership from databricks.labs.ucx.source_code.jobs import WorkflowLinter from databricks.labs.ucx.source_code.queries import QueryLinter -@pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.") def test_query_dfsa_ownership(runtime_ctx, make_query, make_dashboard, inventory_schema, sql_backend) -> None: """Verify the ownership of a direct-fs record for a query.""" @@ -33,9 +29,13 @@ def test_query_dfsa_ownership(runtime_ctx, make_query, make_dashboard, inventory assert owner == runtime_ctx.workspace_client.current_user.me().user_name -@pytest.mark.xfail(reason="DirectFS access records don't currently include creator/owner information.") def test_path_dfsa_ownership( - runtime_ctx, make_notebook, make_job, make_directory, inventory_schema, sql_backend + runtime_ctx, + make_notebook, + make_job, + make_directory, + inventory_schema, + sql_backend, ) -> None: """Verify the ownership of a direct-fs record for a notebook/source path associated with a job.""" @@ -61,5 +61,5 @@ def test_path_dfsa_ownership( path_record = next(record for record in records if record.source_id == str(notebook)) # Verify ownership can be made. - ownership = DirectFsAccessOwnership(runtime_ctx.administrator_locator) - assert ownership.owner_of(path_record) == runtime_ctx.workspace_client.current_user.me().user_name + owner = runtime_ctx.directfs_access_ownership.owner_of(path_record) + assert owner == runtime_ctx.workspace_client.current_user.me().user_name diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index c00c1cdcc6..6c618c1edb 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -2,8 +2,9 @@ from unittest.mock import create_autospec from databricks.labs.lsql.backends import MockBackend +from databricks.sdk import WorkspaceClient -from databricks.labs.ucx.framework.owners import AdministratorLocator +from databricks.labs.ucx.framework.owners import AdministratorLocator, WorkspacePathOwnership from databricks.labs.ucx.source_code.base import LineageAtom from databricks.labs.ucx.source_code.directfs_access import ( DirectFsAccessCrawler, @@ -38,12 +39,15 @@ def test_crawler_appends_dfsas() -> None: def test_directfs_access_ownership() -> None: """Verify that the owner for a direct-fs access record is an administrator.""" + ws = create_autospec(WorkspaceClient) admin_locator = create_autospec(AdministratorLocator) - admin_locator.get_workspace_administrator.return_value = "an_admin" + workspace_path_ownership = create_autospec(WorkspacePathOwnership) + workspace_path_ownership.owner_of.return_value = "other_admin" - ownership = DirectFsAccessOwnership(admin_locator) + ownership = DirectFsAccessOwnership(admin_locator, workspace_path_ownership, ws) dfsa = DirectFsAccess() owner = ownership.owner_of(dfsa) - assert owner == "an_admin" - admin_locator.get_workspace_administrator.assert_called_once() + assert owner == "other_admin" + ws.queries.get.assert_not_called() + admin_locator.get_workspace_administrator.assert_not_called() From e96c6e02e9b174b7a734f0f9bee714fc87b55764 Mon Sep 17 00:00:00 2001 From: Serge Smertin Date: Thu, 24 Oct 2024 13:02:59 +0200 Subject: [PATCH 3/3] ... --- src/databricks/labs/ucx/source_code/directfs_access.py | 2 +- tests/unit/source_code/test_directfs_access.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/databricks/labs/ucx/source_code/directfs_access.py b/src/databricks/labs/ucx/source_code/directfs_access.py index e55906758f..42406d3b13 100644 --- a/src/databricks/labs/ucx/source_code/directfs_access.py +++ b/src/databricks/labs/ucx/source_code/directfs_access.py @@ -80,7 +80,7 @@ def __init__( def _maybe_direct_owner(self, record: DirectFsAccess) -> str | None: if record.source_type == 'QUERY': return self._query_owner(record) - if record.source_type == 'NOTEBOOK': + if record.source_type in {'NOTEBOOK', 'FILE'}: return self._notebook_owner(record) logger.warning(f"Unknown source type {record.source_type} for {record.source_id}") return None diff --git a/tests/unit/source_code/test_directfs_access.py b/tests/unit/source_code/test_directfs_access.py index 6c618c1edb..5a9aadee80 100644 --- a/tests/unit/source_code/test_directfs_access.py +++ b/tests/unit/source_code/test_directfs_access.py @@ -45,7 +45,7 @@ def test_directfs_access_ownership() -> None: workspace_path_ownership.owner_of.return_value = "other_admin" ownership = DirectFsAccessOwnership(admin_locator, workspace_path_ownership, ws) - dfsa = DirectFsAccess() + dfsa = DirectFsAccess(source_lineage=[LineageAtom(object_type="NOTEBOOK", object_id="/x/y/z")]) owner = ownership.owner_of(dfsa) assert owner == "other_admin"