From de5f66038c5bc31e11c5f25158a8ffa9d24149e6 Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Thu, 1 Apr 2021 09:52:46 -0400 Subject: [PATCH 1/7] --fresh moves files aside rather than deleting them. This is to make it possible to restore server state after an accidental --fresh start. --- .gitignore | 1 + app/service/data_svc.py | 53 +++++++++++++++++++++++++++++++++-------- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index 3664c39cc..6a97cd038 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,7 @@ data/sources/* !data/sources/.gitkeep data/objectives/* !data/objectives/.gitkeep +data/backup/* .tox/ # coverage reports diff --git a/app/service/data_svc.py b/app/service/data_svc.py index 950915c6a..2f4af51fd 100644 --- a/app/service/data_svc.py +++ b/app/service/data_svc.py @@ -1,7 +1,7 @@ import asyncio import copy import glob -import os.path +import os import pickle import shutil import warnings @@ -22,6 +22,18 @@ MIN_MODULE_LEN = 1 +DEFAULT_BACKUP_DIR = "data/backup" +DEFAULT_DATA_FILE_GLOBS = ( + 'data/abilities/*', + 'data/adversaries/*', + 'data/facts/*', + 'data/objectives/*' + 'data/payloads/*', + 'data/results/*', + 'data/sources/*', + 'data/object_store', +) + class DataService(DataServiceInterface, BaseService): @@ -32,18 +44,39 @@ def __init__(self): self.ram = copy.deepcopy(self.schema) @staticmethod - async def destroy(): - if os.path.exists('data/object_store'): - os.remove('data/object_store') + def _iter_data_files(): + """Yield paths to data files managed by caldera. - for d in ['data/results', 'data/adversaries', 'data/abilities', 'data/facts', 'data/sources', 'data/payloads', 'data/objectives']: - for f in glob.glob('%s/*' % d): + The files paths are relative to the root caldera folder, so they + will begin with "data/". + """ + for data_glob in DEFAULT_DATA_FILE_GLOBS: + for f in glob.glob(data_glob): if f.startswith('.'): # e.g., .gitkeep continue - elif os.path.isdir(f): - shutil.rmtree(f) - else: - os.remove(f) + yield f + + @staticmethod + async def destroy(backup_dir=DEFAULT_BACKUP_DIR): + """Clear the caldera data directory and server state. + + This moves all data files and the object store to the specified + backup directory. The original data file paths are preserved + under the backup directory. + + Example (original path -> new backup path): + data/results/23deddf-ff3f2.yml -> backup/data/results/23deddf-ff3f2.yml + """ + if os.path.exists(backup_dir): + shutil.rmtree(backup_dir) + + os.mkdir(backup_dir) + + for file_path in DataService._iter_data_files(): + src_dir, src_filename = os.path.split(file_path) + dst_dir = os.path.join(backup_dir, src_dir) + os.makedirs(dst_dir, exist_ok=True) + shutil.move(file_path, os.path.join(dst_dir, src_filename)) async def save_state(self): await self._prune_non_critical_data() From a1f26112be4e9ef1b8bb38bf9aa4bb05f520ff18 Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Thu, 1 Apr 2021 14:05:08 -0400 Subject: [PATCH 2/7] Add backup/.gitkeep and refactor destroy() to be aware of it. --- app/service/data_svc.py | 45 ++++++++++++++++++++++++++++++----------- data/backup/.gitkeep | 0 2 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 data/backup/.gitkeep diff --git a/app/service/data_svc.py b/app/service/data_svc.py index 2f4af51fd..693487d32 100644 --- a/app/service/data_svc.py +++ b/app/service/data_svc.py @@ -22,12 +22,12 @@ MIN_MODULE_LEN = 1 -DEFAULT_BACKUP_DIR = "data/backup" -DEFAULT_DATA_FILE_GLOBS = ( +DATA_BACKUP_DIR = "data/backup" +DATA_FILE_GLOBS = ( 'data/abilities/*', 'data/adversaries/*', 'data/facts/*', - 'data/objectives/*' + 'data/objectives/*', 'data/payloads/*', 'data/results/*', 'data/sources/*', @@ -49,15 +49,36 @@ def _iter_data_files(): The files paths are relative to the root caldera folder, so they will begin with "data/". + + Note: + This will skip any files starting with '.' (e.g., 'gitkeep'). """ - for data_glob in DEFAULT_DATA_FILE_GLOBS: + for data_glob in DATA_FILE_GLOBS: for f in glob.glob(data_glob): - if f.startswith('.'): # e.g., .gitkeep - continue yield f @staticmethod - async def destroy(backup_dir=DEFAULT_BACKUP_DIR): + def _delete_directory_contents(path): + """Delete all files and subdirectories under `path`. + + Note: + This uses `glob` and thus, ignores files files that + start with a '.' (e.g., '.gitkeep') + """ + if not os.path.exists(path): + return + + if not os.path.isdir(path): + raise ValueError(f"Input path must be a directory. Received {path}") + + for path in glob.glob(f"{path}/*"): + if os.path.isdir(path): + shutil.rmtree(path) + else: + os.remove(path) + + @staticmethod + async def destroy(): """Clear the caldera data directory and server state. This moves all data files and the object store to the specified @@ -67,14 +88,14 @@ async def destroy(backup_dir=DEFAULT_BACKUP_DIR): Example (original path -> new backup path): data/results/23deddf-ff3f2.yml -> backup/data/results/23deddf-ff3f2.yml """ - if os.path.exists(backup_dir): - shutil.rmtree(backup_dir) - - os.mkdir(backup_dir) + if os.path.exists(DATA_BACKUP_DIR): + DataService._delete_directory_contents(DATA_BACKUP_DIR) + else: + os.mkdir(DATA_BACKUP_DIR) for file_path in DataService._iter_data_files(): src_dir, src_filename = os.path.split(file_path) - dst_dir = os.path.join(backup_dir, src_dir) + dst_dir = os.path.join(DATA_BACKUP_DIR, src_dir) os.makedirs(dst_dir, exist_ok=True) shutil.move(file_path, os.path.join(dst_dir, src_filename)) diff --git a/data/backup/.gitkeep b/data/backup/.gitkeep new file mode 100644 index 000000000..e69de29bb From 0d5918ebc3d24e78ef12eebc81ed04d3ab7b1aa9 Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Thu, 1 Apr 2021 14:06:49 -0400 Subject: [PATCH 3/7] Don't ignore data/backup/.gitkeep --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 6a97cd038..54f9fccfb 100644 --- a/.gitignore +++ b/.gitignore @@ -30,6 +30,7 @@ data/sources/* data/objectives/* !data/objectives/.gitkeep data/backup/* +!data/backup/.gitkeep .tox/ # coverage reports From bbf5bb550ab5a9317c7a5773b29cf48c7827cf13 Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Fri, 2 Apr 2021 10:09:44 -0400 Subject: [PATCH 4/7] Added info statement for --fresh on server startup. --- server.py | 1 + 1 file changed, 1 insertion(+) diff --git a/server.py b/server.py index 99c94cafb..ed74ce01f 100755 --- a/server.py +++ b/server.py @@ -122,6 +122,7 @@ def list_str(values): init_swagger_documentation(app_svc.application) if args.fresh: + logging.info("Fresh startup: removing server data files") asyncio.get_event_loop().run_until_complete(data_svc.destroy()) run_tasks(services=app_svc.get_services()) From 463944aba8ffb8de656f0b6b1bfd177684e05bb6 Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Fri, 2 Apr 2021 10:11:58 -0400 Subject: [PATCH 5/7] Comment cleanup --- app/service/data_svc.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/app/service/data_svc.py b/app/service/data_svc.py index 693487d32..d6e9172d4 100644 --- a/app/service/data_svc.py +++ b/app/service/data_svc.py @@ -51,7 +51,7 @@ def _iter_data_files(): will begin with "data/". Note: - This will skip any files starting with '.' (e.g., 'gitkeep'). + This will skip any files starting with '.' (e.g., '.gitkeep'). """ for data_glob in DATA_FILE_GLOBS: for f in glob.glob(data_glob): @@ -62,8 +62,9 @@ def _delete_directory_contents(path): """Delete all files and subdirectories under `path`. Note: - This uses `glob` and thus, ignores files files that - start with a '.' (e.g., '.gitkeep') + This uses `glob` and thus, ignores top-level files + that start with a '.' (e.g., '.gitkeep'. Any subdirectories + are deleted entirely (even if they contain '.' files). """ if not os.path.exists(path): return @@ -81,9 +82,8 @@ def _delete_directory_contents(path): async def destroy(): """Clear the caldera data directory and server state. - This moves all data files and the object store to the specified - backup directory. The original data file paths are preserved - under the backup directory. + This moves all data files and the object store to the data backup directory. + The original data file paths are preserved under the backup directory. Example (original path -> new backup path): data/results/23deddf-ff3f2.yml -> backup/data/results/23deddf-ff3f2.yml From 7a1809df1b0dabdbe582f5ffae6d19879dfd5c54 Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Fri, 2 Apr 2021 10:48:44 -0400 Subject: [PATCH 6/7] Prefer single quotes for string literals. --- app/service/data_svc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/app/service/data_svc.py b/app/service/data_svc.py index d6e9172d4..39e8ad30a 100644 --- a/app/service/data_svc.py +++ b/app/service/data_svc.py @@ -70,9 +70,9 @@ def _delete_directory_contents(path): return if not os.path.isdir(path): - raise ValueError(f"Input path must be a directory. Received {path}") + raise ValueError(f'Input path must be a directory. Received {path}') - for path in glob.glob(f"{path}/*"): + for path in glob.glob(f'{path}/*'): if os.path.isdir(path): shutil.rmtree(path) else: From c31f547f7db52bd911981f602e694899e24abe9c Mon Sep 17 00:00:00 2001 From: Bryan Worrell Date: Tue, 6 Apr 2021 09:10:25 -0400 Subject: [PATCH 7/7] Instead of moving files, make a gzip tarball for backups. --- app/service/data_svc.py | 50 ++++++++++++++++------------------------- server.py | 4 ++-- 2 files changed, 21 insertions(+), 33 deletions(-) diff --git a/app/service/data_svc.py b/app/service/data_svc.py index 39e8ad30a..0169c9486 100644 --- a/app/service/data_svc.py +++ b/app/service/data_svc.py @@ -1,8 +1,10 @@ import asyncio import copy +import datetime import glob import os import pickle +import tarfile import shutil import warnings from base64 import b64encode @@ -58,46 +60,32 @@ def _iter_data_files(): yield f @staticmethod - def _delete_directory_contents(path): - """Delete all files and subdirectories under `path`. - - Note: - This uses `glob` and thus, ignores top-level files - that start with a '.' (e.g., '.gitkeep'. Any subdirectories - are deleted entirely (even if they contain '.' files). - """ + def _delete_file(path): if not os.path.exists(path): return - - if not os.path.isdir(path): - raise ValueError(f'Input path must be a directory. Received {path}') - - for path in glob.glob(f'{path}/*'): - if os.path.isdir(path): - shutil.rmtree(path) - else: - os.remove(path) + elif os.path.isdir(path): + shutil.rmtree(path) + else: + os.remove(path) @staticmethod async def destroy(): - """Clear the caldera data directory and server state. - - This moves all data files and the object store to the data backup directory. - The original data file paths are preserved under the backup directory. + """Reset the caldera data directory and server state. - Example (original path -> new backup path): - data/results/23deddf-ff3f2.yml -> backup/data/results/23deddf-ff3f2.yml + This creates a gzipped tarball backup of the data files tracked by caldera. + Paths are preserved within the tarball, with all files having "data/" as the + root. """ - if os.path.exists(DATA_BACKUP_DIR): - DataService._delete_directory_contents(DATA_BACKUP_DIR) - else: + if not os.path.exists(DATA_BACKUP_DIR): os.mkdir(DATA_BACKUP_DIR) - for file_path in DataService._iter_data_files(): - src_dir, src_filename = os.path.split(file_path) - dst_dir = os.path.join(DATA_BACKUP_DIR, src_dir) - os.makedirs(dst_dir, exist_ok=True) - shutil.move(file_path, os.path.join(dst_dir, src_filename)) + timestamp = datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S') + tarball_path = os.path.join(DATA_BACKUP_DIR, f'backup-{timestamp}.tar.gz') + + with tarfile.open(tarball_path, 'w:gz') as tarball: + for file_path in DataService._iter_data_files(): + tarball.add(file_path) + DataService._delete_file(file_path) async def save_state(self): await self._prune_non_critical_data() diff --git a/server.py b/server.py index ed74ce01f..c7ba02326 100755 --- a/server.py +++ b/server.py @@ -13,7 +13,7 @@ from app.service.app_svc import AppService from app.service.auth_svc import AuthService from app.service.contact_svc import ContactService -from app.service.data_svc import DataService +from app.service.data_svc import DataService, DATA_BACKUP_DIR from app.service.event_svc import EventService from app.service.file_svc import FileSvc from app.service.learning_svc import LearningService @@ -122,7 +122,7 @@ def list_str(values): init_swagger_documentation(app_svc.application) if args.fresh: - logging.info("Fresh startup: removing server data files") + logging.info("Fresh startup: resetting server data. See %s directory for data backups.", DATA_BACKUP_DIR) asyncio.get_event_loop().run_until_complete(data_svc.destroy()) run_tasks(services=app_svc.get_services())