diff --git a/.gitignore b/.gitignore
index 3664c39cc..54f9fccfb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,8 @@ data/sources/*
 !data/sources/.gitkeep
 data/objectives/*
 !data/objectives/.gitkeep
+data/backup/*
+!data/backup/.gitkeep
 .tox/
 
 # coverage reports
diff --git a/app/service/data_svc.py b/app/service/data_svc.py
index 950915c6a..0169c9486 100644
--- a/app/service/data_svc.py
+++ b/app/service/data_svc.py
@@ -1,8 +1,10 @@
 import asyncio
 import copy
+import datetime
 import glob
-import os.path
+import os
 import pickle
 import shutil
+import tarfile
 import warnings
 from base64 import b64encode
@@ -22,6 +24,18 @@
 
 MIN_MODULE_LEN = 1
 
+DATA_BACKUP_DIR = "data/backup"
+DATA_FILE_GLOBS = (
+    'data/abilities/*',
+    'data/adversaries/*',
+    'data/facts/*',
+    'data/objectives/*',
+    'data/payloads/*',
+    'data/results/*',
+    'data/sources/*',
+    'data/object_store',
+)
+
 
 class DataService(DataServiceInterface, BaseService):
 
@@ -31,19 +45,47 @@ def __init__(self):
                            schedules=[], plugins=[], obfuscators=[], objectives=[])
         self.ram = copy.deepcopy(self.schema)
 
+    @staticmethod
+    def _iter_data_files():
+        """Yield paths to data files managed by caldera.
+
+        The file paths are relative to the root caldera folder, so they
+        will begin with "data/".
+
+        Note:
+            This will skip any files starting with '.' (e.g., '.gitkeep').
+        """
+        for data_glob in DATA_FILE_GLOBS:
+            for f in glob.glob(data_glob):
+                yield f
+
+    @staticmethod
+    def _delete_file(path):
+        if not os.path.exists(path):
+            return
+        elif os.path.isdir(path):
+            shutil.rmtree(path)
+        else:
+            os.remove(path)
+
     @staticmethod
     async def destroy():
-        if os.path.exists('data/object_store'):
-            os.remove('data/object_store')
+        """Reset the caldera data directory and server state.
 
-        for d in ['data/results', 'data/adversaries', 'data/abilities', 'data/facts', 'data/sources', 'data/payloads', 'data/objectives']:
-            for f in glob.glob('%s/*' % d):
-                if f.startswith('.'):  # e.g., .gitkeep
-                    continue
-                elif os.path.isdir(f):
-                    shutil.rmtree(f)
-                else:
-                    os.remove(f)
+        This creates a gzipped tarball backup of the data files tracked by caldera.
+        Paths are preserved within the tarball, with all files having "data/" as the
+        root.
+        """
+        if not os.path.exists(DATA_BACKUP_DIR):
+            os.mkdir(DATA_BACKUP_DIR)
+
+        timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
+        tarball_path = os.path.join(DATA_BACKUP_DIR, f'backup-{timestamp}.tar.gz')
+
+        with tarfile.open(tarball_path, 'w:gz') as tarball:
+            for file_path in DataService._iter_data_files():
+                tarball.add(file_path)
+                DataService._delete_file(file_path)
 
     async def save_state(self):
         await self._prune_non_critical_data()
diff --git a/data/backup/.gitkeep b/data/backup/.gitkeep
new file mode 100644
index 000000000..e69de29bb
diff --git a/server.py b/server.py
index 99c94cafb..c7ba02326 100755
--- a/server.py
+++ b/server.py
@@ -13,7 +13,7 @@
 from app.service.app_svc import AppService
 from app.service.auth_svc import AuthService
 from app.service.contact_svc import ContactService
-from app.service.data_svc import DataService
+from app.service.data_svc import DataService, DATA_BACKUP_DIR
 from app.service.event_svc import EventService
 from app.service.file_svc import FileSvc
 from app.service.learning_svc import LearningService
@@ -122,6 +122,7 @@ def list_str(values):
     init_swagger_documentation(app_svc.application)
 
     if args.fresh:
+        logging.info("Fresh startup: resetting server data. See %s directory for data backups.", DATA_BACKUP_DIR)
        asyncio.get_event_loop().run_until_complete(data_svc.destroy())
 
     run_tasks(services=app_svc.get_services())
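
As a quick illustration (not part of the diff above): because destroy() adds members with their paths relative to the caldera root (all beginning with "data/"), one of these backups could be restored by unpacking the tarball from the repository root. A minimal sketch, with a stopped server and a hypothetical backup filename following the backup-<timestamp>.tar.gz pattern used in destroy():

    import tarfile

    # Hypothetical example name; pick a real file from data/backup/.
    with tarfile.open('data/backup/backup-20240101120000.tar.gz', 'r:gz') as tarball:
        # Members were added as 'data/...' paths, so extracting into the current
        # directory (the caldera root) puts them back where the server expects them.
        tarball.extractall('.')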