Index orphaned replicas (#6626) #6627
Codecov Report
Attention: Patch coverage is …

Additional details and impacted files

@@            Coverage Diff             @@
##           develop    #6627      +/-   ##
===========================================
+ Coverage    85.50%   85.57%    +0.07%
===========================================
  Files          155      155
  Lines        20758    20874      +116
===========================================
+ Hits         17749    17863      +114
- Misses        3009     3011        +2

☔ View full report in Codecov by Sentry.
Force-pushed daf9bfa to 7e3e665
Force-pushed d0edc4b to d207786
Force-pushed c8b0767 to a95de72
No showstoppers, approved.
For #6691:
Index: test/integration_test.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/test/integration_test.py b/test/integration_test.py
--- a/test/integration_test.py (revision 251c79e7791982fef83293ee40f83be8694466ea)
+++ b/test/integration_test.py (date 1731020545236)
@@ -1905,7 +1905,11 @@
source = self._choose_source(catalog)
# The plugin will raise an exception if the source lacks a prefix
source = source.with_prefix(Prefix.of_everything)
- bundle_fqids = self.repository_plugin(catalog).list_bundles(source, '')
+ # REVIEW: We had issues with this part of the test being surprisingly
+ # slow. We should make sure that the removal of log statements
# from list_bundles doesn't make it harder for us to diagnose
+ # these types of issues. Maybe we should use the client here.
+ bundle_fqids = self.repository_plugin(catalog).list_bundles(source, prefix='')
return self.random.choice(sorted(bundle_fqids))
def _can_bundle(self,
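
Not part of the patch: a minimal sketch of the kind of timing instrumentation the REVIEW comment alludes to, so that a surprisingly slow listing stays diagnosable at the call site even without log statements inside list_bundles. The helper and its standalone setup are hypothetical; only the list_bundles(source, prefix='') call mirrors the test above.

import logging
import time

log = logging.getLogger(__name__)

def timed_list_bundles(plugin, source, prefix=''):
    # Hypothetical helper: wrap the plugin call with coarse timing so a
    # slow listing shows up in the test log with a single line
    start = time.perf_counter()
    bundle_fqids = list(plugin.list_bundles(source, prefix=prefix))
    elapsed = time.perf_counter() - start
    log.info('list_bundles(%r, prefix=%r) returned %d FQIDs in %.1fs',
             source, prefix, len(bundle_fqids), elapsed)
    return bundle_fqids
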
Index: src/azul/plugins/repository/canned/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/repository/canned/__init__.py b/src/azul/plugins/repository/canned/__init__.py
--- a/src/azul/plugins/repository/canned/__init__.py (revision 251c79e7791982fef83293ee40f83be8694466ea)
+++ b/src/azul/plugins/repository/canned/__init__.py (date 1731019797504)
@@ -26,9 +26,6 @@
from furl import (
furl,
)
-from more_itertools import (
- ilen,
-)
from azul import (
CatalogName,
@@ -165,11 +162,11 @@
def count_bundles(self, source: SOURCE_SPEC) -> int:
staging_area = self.staging_area(source.spec.name)
- return ilen(
- links_id
- for links_id in staging_area.links
- if source.prefix is None or links_id.startswith(source.prefix.common)
- )
+ if source.prefix is None:
+ return len(staging_area.links)
+ else:
+ prefix = source.prefix.common
+ return sum(1 for links_id in staging_area.links if links_id.startswith(prefix))
def list_bundles(self,
source: CannedSourceRef,
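
As a standalone illustration of the counting hunk above, with toy data standing in for a real staging area: the removed ilen spelling and the added len/sum spellings agree.

from more_itertools import ilen

links = ['42aa', '42bb', '17cc']  # stand-ins for the links IDs
prefix = '42'

# The removed spelling: count by exhausting a filtered iterator
before = ilen(links_id for links_id in links if links_id.startswith(prefix))
# The added spelling: sum 1 per matching ID
after = sum(1 for links_id in links if links_id.startswith(prefix))
assert before == after == 2

# With no prefix filter, len() needs no iteration on a sized container
assert len(links) == 3
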
Index: src/azul/plugins/metadata/anvil/bundle.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/metadata/anvil/bundle.py b/src/azul/plugins/metadata/anvil/bundle.py
--- a/src/azul/plugins/metadata/anvil/bundle.py (revision 251c79e7791982fef83293ee40f83be8694466ea)
+++ b/src/azul/plugins/metadata/anvil/bundle.py (date 1731025228121)
@@ -130,29 +130,27 @@
pass
def to_json(self) -> MutableJSON:
- def serialize_entities(entities):
+ def to_json(entities):
return {
str(entity_ref): entity
for entity_ref, entity in sorted(entities.items())
}
return {
- 'entities': serialize_entities(self.entities),
- 'orphans': serialize_entities(self.orphans),
+ 'entities': to_json(self.entities),
+ 'orphans': to_json(self.orphans),
'links': [link.to_json() for link in sorted(self.links)]
}
@classmethod
- def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> Self:
- def deserialize_entities(json_entities):
+ def from_json(cls, fqid: BUNDLE_FQID, bundle: JSON) -> Self:
+ def from_json(entities):
return {
EntityReference.parse(entity_ref): entity
- for entity_ref, entity in json_entities.items()
+ for entity_ref, entity in entities.items()
}
- return cls(
- fqid=fqid,
- entities=deserialize_entities(json_['entities']),
- links=set(map(EntityLink.from_json, json_['links'])),
- orphans=deserialize_entities(json_['orphans'])
- )
+ return cls(fqid=fqid,
+ entities=from_json(bundle['entities']),
+ links=set(map(EntityLink.from_json, bundle['links'])),
+ orphans=from_json(bundle['orphans']))
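
The renamed inner helpers shadow the methods that enclose them, which is legal: def simply binds a local name. A minimal illustration with a toy class, not the actual bundle type:

class Example:

    def __init__(self, entities):
        self.entities = entities

    def to_json(self):
        # The inner name shadows the method, but only inside this scope
        def to_json(entities):
            return {str(k): v for k, v in sorted(entities.items())}

        return {'entities': to_json(self.entities)}

assert Example({'b': 2, 'a': 1}).to_json() == {'entities': {'a': 1, 'b': 2}}

The same applies to from_json: inside the classmethod, the bare name refers to the helper, while cls.from_json would still reach the classmethod.
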
Index: src/azul/plugins/repository/tdr_anvil/__init__.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py
--- a/src/azul/plugins/repository/tdr_anvil/__init__.py (revision 251c79e7791982fef83293ee40f83be8694466ea)
+++ b/src/azul/plugins/repository/tdr_anvil/__init__.py (date 1731043074073)
@@ -11,7 +11,6 @@
AbstractSet,
Callable,
Iterable,
- Self,
cast,
)
import uuid
@@ -80,63 +79,83 @@
class BundleType(Enum):
"""
- AnVIL snapshots have no inherent notion of a "bundle". During indexing, we
- dynamically construct bundles by querying each table in the snapshot. This
- class enumerates the tables that require special strategies for listing and
- fetching their bundles.
+ Unlike HCA, AnVIL has no inherent notion of a "bundle". Its data model is
+ strictly relational: each row in a table represents an entity, each entity
has a primary key, and entities reference each other via foreign keys.
+ During indexing, we dynamically construct bundles by querying each table in
+ the snapshot. This class enumerates the tables that require special
+ strategies for listing and fetching their bundles.
- Primary bundles are defined by a biosample entity, termed the bundle entity.
- Each primary bundle includes all of the bundle entity descendants and all of
- those those entities' ancestors, which are discovered by iteratively
- following foreign keys. Biosamples were chosen for this role based on a
- desirable balance between the size and number of the resulting bundles as
- well as the degree of overlap between them. The implementation of the graph
- traversal is tightly coupled to this choice, and switching to a different
- entity type would require re-implementing much of the Plugin code. Primary
- bundles consist of at least one biosample (the bundle entity), exactly one
- dataset, and zero or more other entities of assorted types. Primary bundles
+ Primary bundles are defined by a biosample entity, termed the *bundle
+ entity*. Each primary bundle includes all of the bundle entity's descendants
+ and all of those entities' ancestors. Descendants and ancestors are
+ discovered by iteratively following foreign keys. Biosamples were chosen to
+ act as the bundle entity for primary bundles based on a desirable balance
+ between the size and number of the resulting bundles as well as the degree
+ of overlap between them. The implementation of the graph traversal is
+ tightly coupled to this choice, and switching to a different entity type
+ would require re-implementing much of the Plugin code. Primary bundles
+ consist of at least one biosample (the bundle entity), exactly one dataset
+ entity, and zero or more other entities of assorted types. Primary bundles
never contain orphans because they are bijective to rows in the biosample
table.
Supplementary bundles consist of batches of file entities, which may include
- supplementary files, which lack any foreign keys that associate them with
- any other entity. Non-supplementary files in the bundle are classified as
- orphans. The bundle also includes a dataset entity linked to the
+ supplementary files. The latter lack any foreign keys that would associate
+ them with any other entity. Normal (non-supplementary) files in the bundle
+ are classified as orphans.
+
+ REVIEW: That (above) sounds surprising and may need more explanation.
+
+ Each supplementary bundle also includes the dataset entity linked to the
supplementary files.
- Duos bundles consist of a single dataset entity. This "entity" includes only
+ DUOS bundles consist of a single dataset entity. This "entity" includes only
the dataset description retrieved from DUOS, while a copy of the BigQuery
row for this dataset is also included as an orphan. We chose this design
because there is only one dataset per snapshot, which is referenced in all
primary and supplementary bundles. Therefore, only one request to DUOS per
- *snapshot* is necessary, but if `description` is retrieved at the same time
- as the other dataset fields, we will make one request per *bundle* instead,
- potentially overloading the DUOS service. Our solution is to retrieve
- `description` only in a dedicated bundle format, once per snapshot, and
- merge it with the other dataset fields during aggregation.
+ *snapshot* is necessary. If the DUOS `description` were retrieved at the
+ same time as the other fields of the dataset entity, we would make one
+ request per *bundle* instead, potentially overloading the DUOS service. Our
+ solution is to retrieve `description` only in a bundle of this dedicated
+ DUOS type, once per snapshot, and merge it with the other dataset fields
+ during aggregation.
All other bundles are replica bundles. Replica bundles consist of a batch of
rows from an arbitrary BigQuery table, which may or may not be described by
the AnVIL schema. Replica bundles only include orphans and have no links.
+
+ REVIEW: Confusingly worded. I think what we mean is that the replicas are
+ stored in the `orphans` attribute. We may need to find a new name
+ for that attribute.
"""
primary = 'anvil_biosample'
supplementary = 'anvil_file'
duos = 'anvil_dataset'
- def is_batched(self: Self | str) -> bool:
+ # REVIEW: I'm getting type errors and PyCharm warnings with the original approach
+
+ @classmethod
+ def is_batched(cls, table_name: str) -> bool:
"""
- >>> BundleType.primary.is_batched()
+ True if bundles for the table of the given name represent batches of
+ rows, False if each bundle represents a single row.
+
+ >>> BundleType.primary.is_batched
False
>>> BundleType.is_batched('anvil_activity')
True
"""
- if isinstance(self, str):
- try:
- self = BundleType(self)
- except ValueError:
- return True
- return self not in (BundleType.primary, BundleType.duos)
+ return table_name not in (BundleType.primary.value, BundleType.duos.value)
+
+
+# REVIEW: The change from method to attribute may require more changes at the
+# usage sites
+
+for bundle_type in BundleType:
+ bundle_type.is_batched = BundleType.is_batched(bundle_type.value)
class TDRAnvilBundleFQIDJSON(SourcedBundleFQIDJSON):
@@ -245,28 +264,29 @@
self._assert_source(source)
bundles = []
spec = source.spec
+
if config.duos_service_url is not None:
+ # We intentionally omit the WHERE clause for datasets in order to
+ # verify our assumption that each snapshot only contains rows for a
+ # single dataset. This verification is performed independently and
+ # concurrently for every partition, but only one partition actually
+ # emits the bundle.
row = one(self._run_sql(f'''
SELECT datarepo_row_id
FROM {backtick(self._full_table_name(spec, BundleType.duos.value))}
'''))
dataset_row_id = row['datarepo_row_id']
- # We intentionally omit the WHERE clause for datasets in order
- # to verify our assumption that each snapshot only contains rows
- # for a single dataset. This verification is performed
- # independently and concurrently for every partition, but only
- # one partition actually emits the bundle.
if dataset_row_id.startswith(prefix):
bundle_uuid = change_version(dataset_row_id,
self.datarepo_row_uuid_version,
self.bundle_uuid_version)
- bundles.append(TDRAnvilBundleFQID(
- uuid=bundle_uuid,
- version=self._version,
- source=source,
- table_name=BundleType.duos.value,
- batch_prefix=None,
- ))
+ bundle_fqid = TDRAnvilBundleFQID(uuid=bundle_uuid,
+ version=self._version,
+ source=source,
+ table_name=BundleType.duos.value,
+ batch_prefix=None)
+ bundles.append(bundle_fqid)
+
for row in self._run_sql(f'''
SELECT datarepo_row_id
FROM {backtick(self._full_table_name(spec, BundleType.primary.value))}
@@ -275,24 +295,26 @@
bundle_uuid = change_version(row['datarepo_row_id'],
self.datarepo_row_uuid_version,
self.bundle_uuid_version)
- bundles.append(TDRAnvilBundleFQID(
- uuid=bundle_uuid,
- version=self._version,
- source=source,
- table_name=BundleType.primary.value,
- batch_prefix=None,
- ))
+ bundle_fqid = TDRAnvilBundleFQID(uuid=bundle_uuid,
+ version=self._version,
+ source=source,
+ table_name=BundleType.primary.value,
+ batch_prefix=None)
+ bundles.append(bundle_fqid)
+
prefix_lengths_by_table = self._batch_tables(source.spec, prefix)
for table_name, (batch_prefix_length, _) in prefix_lengths_by_table.items():
batch_prefixes = Prefix(common=prefix,
partition=batch_prefix_length - len(prefix)).partition_prefixes()
for batch_prefix in batch_prefixes:
bundle_uuid = self._batch_uuid(spec, table_name, batch_prefix)
- bundles.append(TDRAnvilBundleFQID(uuid=bundle_uuid,
- version=self._version,
- source=source,
- table_name=table_name,
- batch_prefix=batch_prefix))
+ bundle_fqid = TDRAnvilBundleFQID(uuid=bundle_uuid,
+ version=self._version,
+ source=source,
+ table_name=table_name,
+ batch_prefix=batch_prefix)
+ bundles.append(bundle_fqid)
+
return bundles
def _emulate_bundle(self, bundle_fqid: TDRAnvilBundleFQID) -> TDRAnvilBundle:
@@ -346,6 +368,11 @@
table_names = sorted(filter(BundleType.is_batched, self.tdr.list_tables(source)))
log.info('Calculating batch prefix lengths for partition %r of %d tables '
'in source %s', prefix, len(table_names), source)
+
+ # REVIEW: This needs a FIXME. The respective issue should have a
+ # reproduction, maybe in the form of a diff removing the
+ # workaround, and the resulting unit test failure.
+
# The extraneous outer 'SELECT *' works around a bug in BigQuery emulator
query = ' UNION ALL '.join(f'''(
SELECT * FROM (
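
To back up the is_batched REVIEW items in the hunk above, here is the method-to-attribute pattern in isolation: enum members are ordinary instances, so a loop after the class body can precompute a per-member attribute that shadows the same-named classmethod on member lookup, while class-level calls keep working. Toy enum under stock CPython Enum semantics:

from enum import Enum

class Table(Enum):
    primary = 'anvil_biosample'
    duos = 'anvil_dataset'

    @classmethod
    def is_batched(cls, table_name: str) -> bool:
        # Tables without special handling yield batched (replica) bundles
        return table_name not in (cls.primary.value, cls.duos.value)

# Members are instances, so each can carry a precomputed attribute;
# instance attributes win over the classmethod (a non-data descriptor)
for member in Table:
    member.is_batched = Table.is_batched(member.value)

assert Table.primary.is_batched is False           # attribute on the member
assert Table.is_batched('anvil_activity') is True  # classmethod via the class
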
Index: src/azul/indexer/index_service.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py
--- a/src/azul/indexer/index_service.py (revision 251c79e7791982fef83293ee40f83be8694466ea)
+++ b/src/azul/indexer/index_service.py (date 1731044069661)
@@ -212,6 +212,9 @@
for contributions, replicas in transforms:
tallies.update(self.contribute(catalog, contributions))
self.replicate(catalog, replicas)
+
+ # REVIEW: The addition of this conditional seems like an optimization
+ # that is unrelated to the other changes in that commit
if tallies:
self.aggregate(tallies)
@@ -237,6 +240,9 @@
tallies.update(self.contribute(catalog, contributions))
# FIXME: Replica index does not support deletions
# https://github.com/DataBiosphere/azul/issues/5846
+
+ # REVIEW: Should this also be conditional like above?
+
self.aggregate(tallies)
def deep_transform(self,
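
For the two REVIEW notes above, the accumulate-then-aggregate pattern reduced to a toy. The names mirror the methods in this hunk, but the sketch is not the actual service API.

from collections import Counter

def index(transforms, contribute, replicate, aggregate):
    tallies = Counter()
    for contributions, replicas in transforms:
        tallies.update(contribute(contributions))
        replicate(replicas)
    # Skip the aggregation pass entirely when no entity was contributed,
    # e.g. for bundles that only produced replicas
    if tallies:
        aggregate(tallies)
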
Index: src/azul/plugins/metadata/anvil/indexer/transform.py
IDEA additional info:
Subsystem: com.intellij.openapi.diff.impl.patch.CharsetEP
<+>UTF-8
===================================================================
diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py
--- a/src/azul/plugins/metadata/anvil/indexer/transform.py (revision 251c79e7791982fef83293ee40f83be8694466ea)
+++ b/src/azul/plugins/metadata/anvil/indexer/transform.py (date 1731025001923)
@@ -169,6 +169,8 @@
assert False, entity_type
def estimate(self, partition: BundlePartition) -> int:
+ # REVIEW: I don't quite understand the part after "but". *All* orphans will be replicated by one partition?
+
# Orphans are not considered when deciding whether to partition the
# bundle, but if the bundle is partitioned then each orphan will be
# replicated in a single partition
@@ -577,14 +579,16 @@
partition: BundlePartition
) -> Iterable[Contribution | Replica]:
yield from super().transform(partition)
+ # REVIEW: I think *to coalesce* is rarely used in the passive voice, as in
+ # "The cells are coalesced"; one would rather say "The cells coalesce"
if config.enable_replicas:
- # Replicas are only emitted by the file transformer for entities
- # that are linked to at least one file. This excludes all orphans,
- # and a small number of linked entities, usually from primary
- # bundles don't include any files. Some of the replicas we emit here
- # will be redundant with those emitted by the file transformer, but
- # these will be coalesced by the index service before they are
- # written to ElasticSearch.
+ # The file transformer only emits replicas for entities that are
+ # linked to at least one file. This excludes all orphans, and a
+ # small number of linked entities, usually from primary bundles that
+ # don't include any files. Some of the replicas we emit here will be
+ # redundant with those emitted by the file transformer, but these
+ # will be consolidated by the index service before they are written
+ # to ElasticSearch.
dataset = self._only_dataset()
for entity in chain(self.bundle.orphans, self.bundle.entities):
if partition.contains(UUID(entity.entity_id)):
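
A hedged sketch of the consolidation the reworded comment mentions: replicas emitted for the same entity by more than one transformer collapse into a single document before being written to ElasticSearch. The Replica shape and the merge rule here are assumptions, not the index service's actual types.

from dataclasses import dataclass, field

@dataclass
class Replica:
    entity_id: str
    contents: dict
    hub_ids: set = field(default_factory=set)

def consolidate(replicas):
    by_entity = {}
    for replica in replicas:
        existing = by_entity.get(replica.entity_id)
        if existing is None:
            by_entity[replica.entity_id] = replica
        else:
            # Redundant emission for the same entity: keep one document
            # and union the hubs that reference it
            existing.hub_ids |= replica.hub_ids
    return list(by_entity.values())
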
Security design review
Force-pushed 251c79e to 043298e
Force-pushed 24a106a to 2c7de0b
Force-pushed 2704c7b to 9b6cf31
Connected issues: #6626
Checklist

Author
- … `develop`
- … `issues/<GitHub handle of author>/<issue#>-<slug>`

¹ When the issue title describes a problem, the corresponding PR title is `Fix: ` followed by the issue title

Author (partiality)
- … `p` tag to titles of partial commits
- … `partial` or completely resolves all connected issues
- … `partial` label

Author (chains)
- … `base` or this PR is not chained to another PR
- … `chained` or is not chained to another PR

Author (reindex, API changes)
- … `r` tag to commit title or the changes introduced by this PR will not require reindexing of any deployment
- … `reindex:dev` or the changes introduced by it will not require reindexing of `dev`
- … `reindex:anvildev` or the changes introduced by it will not require reindexing of `anvildev`
- … `reindex:anvilprod` or the changes introduced by it will not require reindexing of `anvilprod`
- … `reindex:prod` or the changes introduced by it will not require reindexing of `prod`
- … `reindex:partial` and its description documents the specific reindexing procedure for `dev`, `anvildev`, `anvilprod` and `prod`, or requires a full reindex, or carries none of the labels `reindex:dev`, `reindex:anvildev`, `reindex:anvilprod` and `reindex:prod`
- … `API` or this PR does not modify a REST API
- … `a` (`A`) tag to commit title for backwards (in)compatible changes or this PR does not modify a REST API
- … `app.py` or this PR does not modify a REST API

Author (upgrading deployments)
- … `make docker_images.json` and committed the resulting changes or this PR does not modify `azul_docker_images`, or any other variables referenced in the definition of that variable
- … `u` tag to commit title or this PR does not require upgrading deployments
- … `upgrade` or does not require upgrading deployments
- … `deploy:shared` or does not modify `docker_images.json`, and does not require deploying the `shared` component for any other reason
- … `deploy:gitlab` or does not require deploying the `gitlab` component
- … `deploy:runner` or does not require deploying the `runner` image

Author (hotfixes)
- … `F` tag to main commit title or this PR does not include permanent fix for a temporary hotfix
- … (`anvilprod` and `prod`) have temporary hotfixes for any of the issues connected to this PR

Author (before every review)
- … `develop`, squashed old fixups
- … `make requirements_update` or this PR does not modify `requirements*.txt`, `common.mk`, `Makefile` and `Dockerfile`
- … `R` tag to commit title or this PR does not modify `requirements*.txt`
- … `reqs` or does not modify `requirements*.txt`
- … `make integration_test` passes in personal deployment or this PR does not modify functionality that could affect the IT outcome

Peer reviewer (after approval)

System administrator (after approval)
- … `demo` or `no demo`
- … `no demo`
- … `no sandbox`
- … `N reviews` label is accurate

Operator (before pushing the merge commit)
- … `reindex:…` labels and `r` commit title tag
- … `no demo`
- … `develop`
- … `_select dev.shared && CI_COMMIT_REF_NAME=develop make -C terraform/shared apply_keep_unused` or this PR is not labeled `deploy:shared`
- … `_select dev.gitlab && CI_COMMIT_REF_NAME=develop make -C terraform/gitlab apply` or this PR is not labeled `deploy:gitlab`
- … `_select anvildev.shared && CI_COMMIT_REF_NAME=develop make -C terraform/shared apply_keep_unused` or this PR is not labeled `deploy:shared`
- … `_select anvildev.gitlab && CI_COMMIT_REF_NAME=develop make -C terraform/gitlab apply` or this PR is not labeled `deploy:gitlab`
- … `deploy:gitlab`
- … `deploy:gitlab`

System administrator
- … `dev.gitlab` are complete or this PR is not labeled `deploy:gitlab`
- … `anvildev.gitlab` are complete or this PR is not labeled `deploy:gitlab`

Operator (before pushing the merge commit)
- … `_select dev.gitlab && make -C terraform/gitlab/runner` or this PR is not labeled `deploy:runner`
- … `_select anvildev.gitlab && make -C terraform/gitlab/runner` or this PR is not labeled `deploy:runner`
- … `sandbox` label or PR is labeled `no sandbox`
- … `dev` or PR is labeled `no sandbox`
- … `anvildev` or PR is labeled `no sandbox`
- … `sandbox` deployment or PR is labeled `no sandbox`
- … `anvilbox` deployment or PR is labeled `no sandbox`
- … `sandbox` deployment or PR is labeled `no sandbox`
- … `anvilbox` deployment or PR is labeled `no sandbox`
- … `sandbox` or this PR does not remove catalogs or otherwise causes unreferenced indices in `dev`
- … `anvilbox` or this PR does not remove catalogs or otherwise causes unreferenced indices in `anvildev`
- … `sandbox` or this PR is not labeled `reindex:dev`
- … `anvilbox` or this PR is not labeled `reindex:anvildev`
- … `sandbox` or this PR is not labeled `reindex:dev`
- … `anvilbox` or this PR is not labeled `reindex:anvildev`
- … `p` if the PR is also labeled `partial`

Operator (chain shortening)
- … `develop` or this PR is not labeled `base`
- … `chained` label from the blocked PR or this PR is not labeled `base`
- … `base`
- … `base` label from this PR or this PR is not labeled `base`

Operator (after pushing the merge commit)
- … `dev`
- … `anvildev`
- … `dev`
- … `dev`
- … `anvildev`
- … `anvildev`
- … `_select dev.shared && make -C terraform/shared apply` or this PR is not labeled `deploy:shared`
- … `_select anvildev.shared && make -C terraform/shared apply` or this PR is not labeled `deploy:shared`
- … `dev`
- … `anvildev`

Operator (reindex)
- … `dev` or this PR is neither labeled `reindex:partial` nor `reindex:dev`
- … `anvildev` or this PR is neither labeled `reindex:partial` nor `reindex:anvildev`
- … `dev` or this PR is neither labeled `reindex:partial` nor `reindex:dev`
- … `anvildev` or this PR is neither labeled `reindex:partial` nor `reindex:anvildev`
- … `dev` or this PR is neither labeled `reindex:partial` nor `reindex:dev`
- … `anvildev` or this PR is neither labeled `reindex:partial` nor `reindex:anvildev`
- … `dev` or this PR does not require reindexing `dev`
- … `anvildev` or this PR does not require reindexing `anvildev`
- … `dev` or this PR does not require reindexing `dev`
- … `anvildev` or this PR does not require reindexing `anvildev`
- … `dev` or this PR does not require reindexing `dev`
- … `anvildev` or this PR does not require reindexing `anvildev`

Operator
- … `deploy:shared`, `deploy:gitlab`, `deploy:runner`, `API`, `reindex:partial`, `reindex:anvilprod` and `reindex:prod` labels to the next promotion PRs or this PR carries none of these labels
- … `deploy:shared`, `deploy:gitlab`, `deploy:runner`, `API`, `reindex:partial`, `reindex:anvilprod` and `reindex:prod` labels, from the description of this PR to that of the next promotion PRs or this PR carries none of these labels

Shorthand for review comments
- `L`: line is too long
- `W`: line wrapping is wrong
- `Q`: bad quotes
- `F`: other formatting problem