Skip to content

Commit

Permalink
Include orphans in manifest when filtering by only project/dataset (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Oct 9, 2024
1 parent 211ff8c commit f31c752
Show file tree
Hide file tree
Showing 11 changed files with 58 additions and 20 deletions.
3 changes: 3 additions & 0 deletions src/azul/indexer/document_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,6 @@ def translate_fields(self,
self.field_types(catalog),
forward=forward,
allowed_paths=allowed_paths)

def always_limit_access(self) -> bool:
    """
    Tell callers whether access limiting applies unconditionally.

    This implementation answers ``True`` in every case; it reads no
    instance state.
    """
    return True
1 change: 1 addition & 0 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class SpecialFields:
source_spec: FieldName
bundle_uuid: FieldName
bundle_version: FieldName
implicit_hub_id: FieldName


class ManifestFormat(Enum):
Expand Down
3 changes: 2 additions & 1 deletion src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,8 @@ def special_fields(self) -> SpecialFields:
return SpecialFields(source_id='source_id',
source_spec='source_spec',
bundle_uuid='bundle_uuid',
bundle_version='bundle_version')
bundle_version='bundle_version',
implicit_hub_id='datasets.dataset_id')

@property
def implicit_hub_type(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion src/azul/plugins/metadata/anvil/service/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
class AnvilFilterStage(FilterStage):
    """
    Filter stage that decides, per request, whether access is limited.
    """

    def _limit_access(self) -> bool:
        """
        Return ``True`` if the service reports that access must always be
        limited, or if the entity type of this stage is anything other than
        ``'datasets'``.
        """
        # The service-level switch is evaluated first so that it overrides the
        # per-entity-type exemption for 'datasets'.
        return self.service.always_limit_access() or self.entity_type != 'datasets'
3 changes: 2 additions & 1 deletion src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ def special_fields(self) -> SpecialFields:
return SpecialFields(source_id='sourceId',
source_spec='sourceSpec',
bundle_uuid='bundleUuid',
bundle_version='bundleVersion')
bundle_version='bundleVersion',
implicit_hub_id='projectId')

@property
def implicit_hub_type(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion src/azul/plugins/metadata/hca/service/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
class HCAFilterStage(FilterStage):
    """
    Filter stage that decides, per request, whether access is limited.
    """

    def _limit_access(self) -> bool:
        """
        Return ``True`` if the service reports that access must always be
        limited, or if the entity type of this stage is anything other than
        ``'projects'``.
        """
        # The service-level switch is evaluated first so that it overrides the
        # per-entity-type exemption for 'projects'.
        return self.service.always_limit_access() or self.entity_type != 'projects'
12 changes: 10 additions & 2 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1985,7 +1985,7 @@ class VerbatimManifestGenerator(FileBasedManifestGenerator, metaclass=ABCMeta):

@property
def entity_type(self) -> str:
    """
    The entity type this generator operates on.

    When :attr:`include_orphans` is true this is the implicit hub type,
    otherwise it is ``'files'``.
    """
    if self.include_orphans:
        return self.implicit_hub_type
    return 'files'

@property
def included_fields(self) -> list[FieldPath]:
Expand All @@ -2001,6 +2001,11 @@ def included_fields(self) -> list[FieldPath]:
def implicit_hub_type(self) -> str:
return self.service.metadata_plugin(self.catalog).implicit_hub_type

@property
def include_orphans(self) -> bool:
    """
    ``True`` if the explicit filters name exactly one field and that field
    is the implicit hub ID field declared by the metadata plugin of this
    generator's catalog. Only the filtered field names are compared; the
    filter values are not inspected.
    """
    plugin = self.service.metadata_plugin(self.catalog)
    hub_id_field = plugin.special_fields.implicit_hub_id
    filtered_fields = self.filters.explicit.keys()
    return filtered_fields == {hub_id_field}

@attrs.frozen(kw_only=True)
class ReplicaKeys:
"""
Expand All @@ -2019,8 +2024,11 @@ def _replica_keys(self) -> Iterable[ReplicaKeys]:
hub_type = self.implicit_hub_type
request = self._create_request()
for hit in request.scan():
replica_id = one(hit['contents'][hub_type])['document_id']
if self.entity_type != hub_type:
replica_id = one(replica_id)
yield self.ReplicaKeys(hub_id=hit['entity_id'],
replica_id=one(one(hit['contents'][hub_type])['document_id']))
replica_id=replica_id)

def _all_replicas(self) -> Iterable[JSON]:
emitted_replica_ids = set()
Expand Down
3 changes: 3 additions & 0 deletions src/azul/service/repository_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,6 @@ def _hit_to_doc(hit: Hit) -> JSON:
if file_version is not None:
assert file_version == file['version']
return file

def always_limit_access(self) -> bool:
    """
    Tell callers whether access limiting applies unconditionally.

    This implementation answers ``False`` in every case; it reads no
    instance state.
    """
    return False
3 changes: 2 additions & 1 deletion test/service/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class TestFilterReification(AzulTestCase):
source_id='sourceId',
source_spec=MagicMock(),
bundle_uuid=MagicMock(),
bundle_version=MagicMock()
bundle_version=MagicMock(),
implicit_hub_id=MagicMock()
)

@property
Expand Down
43 changes: 31 additions & 12 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,12 @@ def _assert_tsv(self, expected: list[tuple[str, ...]], actual: Response):
actual[1:], expected[1:] = sorted(actual[1:]), sorted(expected[1:])
self.assertEqual(expected, actual)

def _read_jsonl(self, response: Response) -> list[JSON]:
return [
json.loads(row)
for row in response.content.decode().splitlines()
]

def _assert_jsonl(self, expected: list[JSON], actual: Response):
"""
Assert that the body of the given response is the expected JSON array,
Expand All @@ -233,10 +239,7 @@ def _assert_jsonl(self, expected: list[JSON], actual: Response):
:param actual: an HTTP response containing JSON objects separated by
newlines
"""
manifest = [
json.loads(row)
for row in actual.content.decode().splitlines()
]
manifest = self._read_jsonl(actual)

def sort_key(row: JSON) -> bytes:
return json_hash(row).digest()
Expand Down Expand Up @@ -2095,14 +2098,30 @@ def test_compact_manifest(self):
self._assert_tsv(expected, response)

def test_verbatim_jsonl_manifest(self):
    """
    Check the verbatim JSONL manifest under three filter configurations,
    each in its own subtest so that one failure does not hide the others.
    """
    filters = {}
    with self.subTest(filters=filters):
        # Expect no rows from the replica bundle because we aren't filtering
        # by dataset ID
        self._test_jsonl(filters=filters,
                         entities_from_bundles={
                             '2370f948-2783-aeb6-afea-e022897f4dcf',
                             '6b0f6c0f-5d80-a242-accb-840921351cd5',
                             '826dea02-e274-affe-aabc-eb3db63ad068'
                         })
    filters = {'datasets.title': {'is': ['ANVIL_HPRC']}}
    with self.subTest(filters=filters):
        # Expect an empty manifest because the filter would only match rows
        # from the replica bundle
        self._test_jsonl(filters=filters,
                         entities_from_bundles=set())
    filters = {'datasets.dataset_id': {'is': ['59960255-cca9-fcd3-d8ca-eb88f2c72f00']}}
    with self.subTest(filters=filters):
        # Expect only rows from the replica bundle
        self._test_jsonl(filters=filters,
                         entities_from_bundles=set(),
                         orphans_from_bundles={
                             'abc00000-0000-a000-0000-000000000000'
                         })

def _test_jsonl(self,
*,
Expand Down
3 changes: 2 additions & 1 deletion test/service/test_request_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ def special_fields(self) -> SpecialFields:
return SpecialFields(source_id='sourceId',
source_spec='sourceSpec',
bundle_uuid='bundleUuid',
bundle_version='bundleVersion')
bundle_version='bundleVersion',
implicit_hub_id='projectId')

@property
def _field_mapping(self) -> MetadataPlugin._FieldMapping:
Expand Down

0 comments on commit f31c752

Please sign in to comment.