Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD: New repository CLI #4965

Merged
merged 19 commits into from
Dec 9, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ repos:
pass_filenames: true
files: >-
(?x)^(
aiida/backends/control.py|
aiida/common/progress_reporter.py|
aiida/engine/.*py|
aiida/manage/manager.py|
Expand Down
15 changes: 15 additions & 0 deletions aiida/backends/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,21 @@
###########################################################################
"""Module for implementations of database backends."""

# AUTO-GENERATED

# yapf: disable
# pylint: disable=wildcard-import

from .control import *

__all__ = (
'MAINTAIN_LOGGER',
)

# yapf: enable

# END AUTO-GENERATED

BACKEND_DJANGO = 'django'
BACKEND_SQLA = 'sqlalchemy'

Expand Down
100 changes: 100 additions & 0 deletions aiida/backends/control.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
ramirezfranciscof marked this conversation as resolved.
Show resolved Hide resolved
###########################################################################
# Copyright (c), The AiiDA team. All rights reserved. #
# This file is part of the AiiDA code. #
# #
# The code is hosted on GitHub at https://github.com/aiidateam/aiida-core #
# For further information on the license, see the LICENSE.txt file #
# For further information please visit http://www.aiida.net #
###########################################################################
"""Module for overall repository control commands."""
# Note: these functions are not methods of `AbstractRepositoryBackend` because they need access to the orm.
# This is because they have to go through all the nodes to gather the list of keys that AiiDA is keeping
# track of (since they are decentralized in each node entry).
# See the get_unreferenced_keyset function
from typing import TYPE_CHECKING, Optional, Set

from aiida.common.log import AIIDA_LOGGER
from aiida.manage.manager import get_manager
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from aiida.manage.manager import get_manager
from typing import Optional, Set
from aiida.manage.manager import get_manager


if TYPE_CHECKING:
from aiida.orm.implementation import Backend

__all__ = ('MAINTAIN_LOGGER',)
sphuber marked this conversation as resolved.
Show resolved Hide resolved

MAINTAIN_LOGGER = AIIDA_LOGGER.getChild('maintain')
sphuber marked this conversation as resolved.
Show resolved Hide resolved


def repository_maintain(
    full: bool = False,
    dry_run: bool = False,
    backend: Optional['Backend'] = None,
    **kwargs,
) -> None:
    """Run the maintenance procedures of the repository.

    The objects that exist in the repository but are not referenced in the database are deleted first,
    after which the ``maintain`` method of the repository itself is called.

    :param full:
        flag to perform operations that require to stop using the maintained profile.

    :param dry_run:
        flag to only log the actions that would be taken without actually executing them.

    :param backend:
        specific backend in which to apply the maintenance (defaults to the one of the current profile).

    :param kwargs:
        additional keyword arguments that are passed on as they are to the ``maintain`` method of the repository.
    """
    target_backend = get_manager().get_backend() if backend is None else backend
    repository = target_backend.get_repository()

    orphaned_keys = get_unreferenced_keyset(aiida_backend=target_backend)
    MAINTAIN_LOGGER.info(f'Deleting {len(orphaned_keys)} unreferenced objects ...')
    if not dry_run:
        repository.delete_objects(list(orphaned_keys))

    MAINTAIN_LOGGER.info('Starting repository-specific operations ...')
    # A "full" maintenance is exactly the one that is not safe to run live.
    repository.maintain(live=not full, dry_run=dry_run, **kwargs)


def get_unreferenced_keyset(check_consistency: bool = True, aiida_backend: Optional['Backend'] = None) -> Set[str]:
    """Collect the keys of the objects that are stored in the repository but that AiiDA does not keep track of.

    This should be all the soft-deleted files.

    :param check_consistency:
        toggle for a check that raises if there are references in the database with no actual object in the
        underlying repository.

    :param aiida_backend:
        specific backend in which to apply the operation (defaults to the one of the current profile).

    :return:
        a set with all the objects in the underlying repository that are not referenced in the database.

    :raises RuntimeError:
        if ``check_consistency`` is enabled and the database references keys that the repository does not contain.
    """
    from aiida import orm
    MAINTAIN_LOGGER.info('Obtaining unreferenced object keys ...')

    backend = get_manager().get_backend() if aiida_backend is None else aiida_backend
    repository = backend.get_repository()

    repository_keys = set(repository.list_objects())
    database_keys = set(orm.Node.objects(backend).iter_repo_keys())

    # Every key known to the database has to exist in the repository, otherwise the storage is corrupted.
    if check_consistency and not database_keys.issubset(repository_keys):
        raise RuntimeError(
            'There are objects referenced in the database that are not present in the repository. Aborting!'
        )

    return repository_keys - database_keys


def get_repository_info(statistics: bool = False, backend: Optional['Backend'] = None) -> dict:
    """Return the general information that the repository of the backend provides.

    :param statistics:
        flag that is passed on to the ``get_info`` method of the repository.

    :param backend:
        specific backend whose repository is inspected (defaults to the one of the current profile).

    :return:
        the dictionary returned by the ``get_info`` method of the repository.
    """
    target_backend = get_manager().get_backend() if backend is None else backend
    return target_backend.get_repository().get_info(statistics)
6 changes: 6 additions & 0 deletions aiida/backends/general/migrations/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ def list_objects(self) -> Iterable[str]:
def iter_object_streams(self, keys: List[str]):
raise NotImplementedError()

def maintain(self, dry_run: bool = False, live: bool = True, **kwargs) -> None:
    """Maintenance operations are not implemented here: calling this always raises ``NotImplementedError``."""
    raise NotImplementedError()

def get_info(self, statistics: bool = False, **kwargs) -> dict:
    """Repository information is not implemented here: calling this always raises ``NotImplementedError``."""
    raise NotImplementedError()


def migrate_legacy_repository(shard=None):
"""Migrate the legacy file repository to the new disk object store and return mapping of repository metadata.
Expand Down
52 changes: 51 additions & 1 deletion aiida/cmdline/commands/cmd_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,58 @@ def storage_integrity():
@click.option('--statistics', is_flag=True, help='Provides more in-detail statistically relevant data.')
def storage_info(statistics):
    """Summarise the contents of the storage."""
    # NOTE: the docstring above is also the help text of the command, hence the details are kept in comments.
    from aiida.backends.control import get_repository_info
    from aiida.cmdline.utils.common import get_database_summary
    from aiida.orm import QueryBuilder

    # The database summary is computed a single time, directly into the report. The `statistics` flag is
    # passed to both the database and the repository section.
    data = {
        'database': get_database_summary(QueryBuilder, statistics),
        'repository': get_repository_info(statistics=statistics),
    }

    # `sort_keys=False` is passed so the order in which the sections were added above is not re-sorted.
    echo.echo_dictionary(data, sort_keys=False, fmt='yaml')


@verdi_storage.command('maintain')
@click.option(
    '--full',
    is_flag=True,
    help='Perform all maintenance tasks, including the ones that should not be executed while the profile is in use.'
)
@click.option(
    '--dry-run',
    is_flag=True,
    help=
    'Run the maintenance in dry-run mode which will print actions that would be taken without actually executing them.'
)
def storage_maintain(full, dry_run):
    """Performs maintenance tasks on the repository."""
    # NOTE: the docstring above is also the help text of the command, hence the details are kept in comments.
    from aiida.backends.control import repository_maintain

    if full:
        # The full maintenance is not safe to run while the profile is in use, so warn explicitly.
        echo.echo_warning(
            '\nIn order to safely perform the full maintenance operations on the internal storage, no other '
            'process should be using the AiiDA profile being maintained. '
            'This includes daemon workers, verdi shells, scripts with the profile loaded, etc. '
            'Please make sure there is nothing like this currently running and that none is started until '
            'these procedures conclude. '
            'For performing maintenance operations that are safe to run while actively using AiiDA, just run '
            '`verdi storage maintain`, without the `--full` flag.\n'
        )

    else:
        echo.echo(
            '\nThis command will perform all maintenance operations on the internal storage that can be safely '
            'executed while still running AiiDA. '
            'However, not all operations that are required to fully optimize disk usage and future performance '
            'can be done in this way. '
            'Whenever you find the time or opportunity, please consider running `verdi storage maintain '
            '--full` for a more complete optimization.\n'
        )

    # A dry run does not change anything, so a confirmation is only requested for an actual run.
    if not dry_run and not click.confirm('Are you sure you want to continue in this mode?'):
        return

    repository_maintain(full=full, dry_run=dry_run)
    echo.echo_success('Requested maintenance procedures finished.')
23 changes: 23 additions & 0 deletions aiida/repository/backend/abstract.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,29 @@ def list_objects(self) -> Iterable[str]:
:return: An iterable for all the available object keys.
"""

@abc.abstractmethod
def get_info(self, statistics: bool = False, **kwargs) -> dict:
    """Return information that is relevant about the content of the repository.

    :param statistics:
        flag to request extra information; it is ``False`` by default, in which case only basic
        information is returned.

    :return: a dictionary with the information.
    """

@abc.abstractmethod
def maintain(self, dry_run: bool = False, live: bool = True, **kwargs) -> None:
    """Perform the maintenance operations of the repository.

    :param dry_run:
        flag to only print the actions that would be taken, without actually executing them.

    :param live:
        flag that tells the backend whether AiiDA is live or not, i.e. whether the profile of the backend
        is currently being used/accessed. The backend is then expected to only allow (and thus set by
        default) the operations that are safe to perform in that state.
    """

@contextlib.contextmanager
def open(self, key: str) -> Iterator[BinaryIO]:
"""Open a file handle to an object stored under the given key.
Expand Down
95 changes: 95 additions & 0 deletions aiida/repository/backend/disk_object_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

__all__ = ('DiskObjectStoreRepositoryBackend',)

BYTES_TO_MB = 1 / 1024**2


class DiskObjectStoreRepositoryBackend(AbstractRepositoryBackend):
"""Implementation of the ``AbstractRepositoryBackend`` using the ``disk-object-store`` as the backend."""
Expand Down Expand Up @@ -118,3 +120,96 @@ def get_object_hash(self, key: str) -> str:
if self.container.hash_type != 'sha256':
return super().get_object_hash(key)
return key

def maintain(  # type: ignore # pylint: disable=arguments-differ,too-many-branches
    self,
    dry_run: bool = False,
    live: bool = True,
    pack_loose: bool = None,
    do_repack: bool = None,
    clean_storage: bool = None,
    do_vacuum: bool = None,
) -> None:
    """Performs maintenance operations.

    :param dry_run:
        flag to only report the actions that would be taken without actually executing them.
    :param live:
        if True, will only perform operations that are safe to do while the repository is in use.
    :param pack_loose:
        flag for forcing the packing of loose files (enabled if not specified).
    :param do_repack:
        flag for forcing the re-packing of already packed files (enabled if not specified and not ``live``).
    :param clean_storage:
        flag for forcing the cleaning of soft-deleted files from the repository (enabled if not specified and
        not ``live``).
    :param do_vacuum:
        flag for forcing the vacuuming of the internal database when cleaning the repository (enabled if not
        specified and not ``live``).
    :raises ValueError:
        if ``live`` is True and any of ``do_repack``, ``clean_storage`` or ``do_vacuum`` is enabled.
    """
    # Validate the arguments before anything else: these operations can never be combined with `live=True`.
    if live and (do_repack or clean_storage or do_vacuum):
        overrides = {'do_repack': do_repack, 'clean_storage': clean_storage, 'do_vacuum': do_vacuum}
        # Iterate over the items: iterating over the dictionary itself only yields its keys.
        keys = ', '.join(key for key, override in overrides.items() if override)
        raise ValueError(f'The following overrides were enabled but cannot be if `live=True`: {keys}')

    from aiida.backends.control import MAINTAIN_LOGGER
    logger = MAINTAIN_LOGGER.getChild('disk_object_store')

    pack_loose = True if pack_loose is None else pack_loose

    if live:
        do_repack = False
        clean_storage = False
        do_vacuum = False
    else:
        do_repack = True if do_repack is None else do_repack
        clean_storage = True if clean_storage is None else clean_storage
        do_vacuum = True if do_vacuum is None else do_vacuum

    if pack_loose:
        files_numb = self.container.count_objects()['loose']
        files_size = self.container.get_total_size()['total_size_loose'] * BYTES_TO_MB
        logger.report(f'Packing all loose files ({files_numb} files occupying {files_size} MB) ...')
        if not dry_run:
            self.container.pack_all_loose()

    if do_repack:
        # The current size is reported to give an idea of how long the re-packing might take.
        files_numb = self.container.count_objects()['packed']
        files_size = self.container.get_total_size()['total_size_packfiles_on_disk'] * BYTES_TO_MB
        logger.report(f'Re-packing all pack files ({files_numb} files in packs, occupying {files_size} MB) ...')
        if not dry_run:
            self.container.repack()

    if clean_storage:
        logger.report(f'Cleaning the repository database (with `vacuum={do_vacuum}`) ...')
        if not dry_run:
            self.container.clean_storage(vacuum=do_vacuum)


def get_info(  # type: ignore # pylint: disable=arguments-differ
    self,
    statistics=False,
) -> dict:
    """Return information about the container of the repository.

    The hash type and the compression algorithm of the container are always included. If ``statistics`` is
    enabled, the number of pack files, the number of unpacked and packed objects and the sizes (in MB) of the
    loose objects, the pack files and the pack indexes are added.

    :param statistics:
        flag to also include the object counts and the sizes in the output.

    :return: a dictionary with the information.
    """
    output_info: dict = {
        'SHA-hash algorithm': self.container.hash_type,
        'Compression algorithm': self.container.compression_algorithm,
    }

    if statistics:
        files_data = self.container.count_objects()
        size_data = self.container.get_total_size()

        output_info['Packs'] = files_data['pack_files']
        output_info['Objects'] = {label: files_data[source] for label, source in (
            ('unpacked', 'loose'),
            ('packed', 'packed'),
        )}
        # The sizes of the container are converted to MB for the output.
        output_info['Size (MB)'] = {label: size_data[source] * BYTES_TO_MB for label, source in (
            ('unpacked', 'total_size_loose'),
            ('packed', 'total_size_packfiles_on_disk'),
            ('other', 'total_size_packindexes_on_disk'),
        )}

    return output_info
6 changes: 6 additions & 0 deletions aiida/repository/backend/sandbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,3 +115,9 @@ def delete_objects(self, keys: List[str]) -> None:

def list_objects(self) -> Iterable[str]:
return self.sandbox.get_content_list()

def maintain(self, dry_run: bool = False, live: bool = True, **kwargs) -> None:
    """Maintenance operations are not implemented here: calling this always raises ``NotImplementedError``."""
    raise NotImplementedError()

def get_info(self, statistics: bool = False, **kwargs) -> dict:
    """Repository information is not implemented here: calling this always raises ``NotImplementedError``."""
    raise NotImplementedError()
6 changes: 6 additions & 0 deletions aiida/tools/archive/implementations/sqlite/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,12 @@ def delete_objects(self, keys: List[str]) -> None:
def get_object_hash(self, key: str) -> str:
return key

def maintain(self, dry_run: bool = False, live: bool = True, **kwargs) -> None:
    """Maintenance operations are not implemented here: calling this always raises ``NotImplementedError``."""
    raise NotImplementedError()

def get_info(self, statistics: bool = False, **kwargs) -> dict:
    """Repository information is not implemented here: calling this always raises ``NotImplementedError``."""
    raise NotImplementedError()


class ArchiveBackendQueryBuilder(SqlaQueryBuilder):
"""Archive query builder"""
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/command_line.rst
Original file line number Diff line number Diff line change
Expand Up @@ -553,6 +553,7 @@ Below is a list with all available subcommands.
Commands:
info Summarise the contents of the storage.
integrity Checks for the integrity of the data storage.
maintain Performs maintenance tasks on the repository.
migrate Migrate the storage to the latest schema version.


Expand Down
Loading