Skip to content

Commit

Permalink
Include orphans in manifest when filtering by only project/dataset (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Nov 7, 2024
1 parent 74fdd4d commit 54e6836
Show file tree
Hide file tree
Showing 11 changed files with 51 additions and 14 deletions.
3 changes: 3 additions & 0 deletions src/azul/indexer/document_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,3 +122,6 @@ def translate_fields(self,
self.field_types(catalog),
forward=forward,
allowed_paths=allowed_paths)

def always_limit_access(self) -> bool:
return True
1 change: 1 addition & 0 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ class SpecialFields:
source_spec: FieldName
bundle_uuid: FieldName
bundle_version: FieldName
implicit_hub_id: FieldName


class ManifestFormat(Enum):
Expand Down
12 changes: 10 additions & 2 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,8 @@ def special_fields(self) -> SpecialFields:
return SpecialFields(source_id='source_id',
source_spec='source_spec',
bundle_uuid='bundle_uuid',
bundle_version='bundle_version')
bundle_version='bundle_version',
implicit_hub_id='datasets.dataset_id')

@property
def implicit_hub_type(self) -> str:
Expand Down Expand Up @@ -330,7 +331,14 @@ def verbatim_pfb_schema(self,
schema['name']: schema
for schema in anvil_schema['tables']
}
entity_schemas = []
non_schema_replicas = [
r for r in replicas
if r['replica_type'] not in table_schemas_by_name
]
# For tables not described by the AnVIL schema, fall back to building
# their PFB schema dynamically from the shapes of the replicas
entity_schemas = super().verbatim_pfb_schema(non_schema_replicas)
# For the rest, use the AnVIL schema as the basis of the PFB schema
for table_name, table_schema in table_schemas_by_name.items():
# FIXME: Improve handling of DUOS replicas
# https://github.com/DataBiosphere/azul/issues/6139
Expand Down
2 changes: 1 addition & 1 deletion src/azul/plugins/metadata/anvil/service/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
class AnvilFilterStage(FilterStage):

def _limit_access(self) -> bool:
return self.entity_type != 'datasets'
return self.service.always_limit_access() or self.entity_type != 'datasets'
3 changes: 2 additions & 1 deletion src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ def special_fields(self) -> SpecialFields:
return SpecialFields(source_id='sourceId',
source_spec='sourceSpec',
bundle_uuid='bundleUuid',
bundle_version='bundleVersion')
bundle_version='bundleVersion',
implicit_hub_id='projectId')

@property
def implicit_hub_type(self) -> str:
Expand Down
2 changes: 1 addition & 1 deletion src/azul/plugins/metadata/hca/service/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
class HCAFilterStage(FilterStage):

def _limit_access(self) -> bool:
return self.entity_type != 'projects'
return self.service.always_limit_access() or self.entity_type != 'projects'
12 changes: 10 additions & 2 deletions src/azul/service/manifest_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1985,7 +1985,7 @@ class VerbatimManifestGenerator(FileBasedManifestGenerator, metaclass=ABCMeta):

@property
def entity_type(self) -> str:
return 'files'
return self.implicit_hub_type if self.include_orphans else 'files'

@property
def included_fields(self) -> list[FieldPath]:
Expand All @@ -2001,6 +2001,11 @@ def included_fields(self) -> list[FieldPath]:
def implicit_hub_type(self) -> str:
return self.service.metadata_plugin(self.catalog).implicit_hub_type

@property
def include_orphans(self) -> bool:
special_fields = self.service.metadata_plugin(self.catalog).special_fields
return self.filters.explicit.keys() == {special_fields.implicit_hub_id}

@attrs.frozen(kw_only=True)
class ReplicaKeys:
"""
Expand All @@ -2019,8 +2024,11 @@ def _replica_keys(self) -> Iterable[ReplicaKeys]:
hub_type = self.implicit_hub_type
request = self._create_request()
for hit in request.scan():
replica_id = one(hit['contents'][hub_type])['document_id']
if self.entity_type != hub_type:
replica_id = one(replica_id)
yield self.ReplicaKeys(hub_id=hit['entity_id'],
replica_id=one(one(hit['contents'][hub_type])['document_id']))
replica_id=replica_id)

def _all_replicas(self) -> Iterable[JSON]:
emitted_replica_ids = set()
Expand Down
3 changes: 3 additions & 0 deletions src/azul/service/repository_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -374,3 +374,6 @@ def _hit_to_doc(hit: Hit) -> JSON:
if file_version is not None:
assert file_version == file['version']
return file

def always_limit_access(self) -> bool:
return False
3 changes: 2 additions & 1 deletion test/service/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ class TestFilterReification(AzulTestCase):
source_id='sourceId',
source_spec=MagicMock(),
bundle_uuid=MagicMock(),
bundle_version=MagicMock()
bundle_version=MagicMock(),
implicit_hub_id=MagicMock()
)

@property
Expand Down
21 changes: 16 additions & 5 deletions test/service/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2089,6 +2089,7 @@ def test_compact_manifest(self):

def test_verbatim_jsonl_manifest(self):
linked_entities_by_hash = {}
all_entities_by_hash = {}

def unique_rows(entities: Mapping[EntityReference, JSON]
) -> Iterable[tuple[str, JSON]]:
Expand All @@ -2101,11 +2102,21 @@ def unique_rows(entities: Mapping[EntityReference, JSON]
for bundle_fqid in self.bundles():
bundle = self._load_canned_bundle(bundle_fqid)
linked_entities_by_hash.update(unique_rows(bundle.entities))

response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters={})
self.assertEqual(200, response.status_code)
expected_rows = list(linked_entities_by_hash.values())
self._assert_jsonl(expected_rows, response)
all_entities_by_hash.update(unique_rows(bundle.orphans))
all_entities_by_hash.update(linked_entities_by_hash)

for filters, expect_orphans in [
({}, False),
({'datasets.title': {'is': ['ANVIL_CMG_UWASH_DS_BDIS']}}, False),
# Orphans should be included only when filtering by dataset ID
({'datasets.dataset_id': {'is': ['52ee7665-7033-63f2-a8d9-ce8e32666739']}}, True)
]:
with self.subTest(filters=filters):
response = self._get_manifest(ManifestFormat.verbatim_jsonl, filters=filters)
self.assertEqual(200, response.status_code)
expected_rows = list((all_entities_by_hash if expect_orphans
else linked_entities_by_hash).values())
self._assert_jsonl(expected_rows, response)

def test_verbatim_pfb_manifest(self):
response = self._get_manifest(ManifestFormat.verbatim_pfb, filters={})
Expand Down
3 changes: 2 additions & 1 deletion test/service/test_request_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ def special_fields(self) -> SpecialFields:
return SpecialFields(source_id='sourceId',
source_spec='sourceSpec',
bundle_uuid='bundleUuid',
bundle_version='bundleVersion')
bundle_version='bundleVersion',
implicit_hub_id='projectId')

@property
def _field_mapping(self) -> MetadataPlugin._FieldMapping:
Expand Down

0 comments on commit 54e6836

Please sign in to comment.