From 7caa128d9282e6bffb9fecc7c6f2cea968dc9f32 Mon Sep 17 00:00:00 2001
From: Brian Goldman
Date: Fri, 1 Dec 2023 13:45:47 -0700
Subject: [PATCH 1/2] Define a versioning helper for external data
 dependencies.

This helper is intended to mediate between the external source and our
code (e.g. Scenario), using sha256 hashing to version external data.
---
 newhelm/dependency_helper.py      | 146 +++++++++++++++++++++++++
 newhelm/dependency_helper_test.py | 176 ++++++++++++++++++++++++++++++
 newhelm/external_data.py          |  17 +++
 newhelm/general.py                |  13 +++
 4 files changed, 352 insertions(+)
 create mode 100644 newhelm/dependency_helper.py
 create mode 100644 newhelm/dependency_helper_test.py
 create mode 100644 newhelm/external_data.py

diff --git a/newhelm/dependency_helper.py b/newhelm/dependency_helper.py
new file mode 100644
index 00000000..1c1c5121
--- /dev/null
+++ b/newhelm/dependency_helper.py
@@ -0,0 +1,146 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+import glob
+import os
+import shutil
+import tempfile
+from typing import Dict, Mapping, Optional
+
+from newhelm.external_data import ExternalData
+from newhelm.general import current_timestamp_millis, from_json, hash_file, to_json
+
+
+class DependencyHelper(ABC):
+    @abstractmethod
+    def get_local_path(self, dependency_key: str) -> str:
+        """Return a path to the dependency, downloading as needed."""
+
+    @abstractmethod
+    def versions_used(self) -> Mapping[str, str]:
+        """Report the version of all dependencies accessed during this run."""
+
+    @abstractmethod
+    def update_all_dependencies(self) -> Mapping[str, str]:
+        """Ensure the local system has the latest version of all dependencies."""
+
+
+@dataclass(frozen=True)
+class DependencyVersionMetadata:
+    """Data object we can store alongside a dependency version."""
+
+    version: str
+    creation_time_millis: int = field(default_factory=current_timestamp_millis)
+
+
+class FromSourceDependencyHelper(DependencyHelper):
+    """When a dependency isn't available locally, download from the primary source.
+
+    When used, the local directory structure will look like this:
+    data_dir/
+        dependency_1/
+            version_x.metadata
+            version_x/
+
+            version_y.metadata
+            version_y/
+
+            ...
+        dependency_2/
+            ...
+        ...
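+
+    Each version directory is named with the sha256 hash of the downloaded
+    file, and the matching .metadata file records when that version was
+    first stored locally.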
+ """ + + def __init__( + self, + data_dir, + dependencies: Mapping[str, ExternalData], + required_versions: Mapping[str, str], + ): + self.data_dir = data_dir + self.dependencies = dependencies + self.required_versions = required_versions + self.used_dependencies: Dict[str, str] = {} + + def get_local_path(self, dependency_key: str) -> str: + assert dependency_key in self.dependencies + external_data: ExternalData = self.dependencies[dependency_key] + + version: str + if dependency_key in self.required_versions: + version = self.required_versions[dependency_key] + self._ensure_required_version_exists(dependency_key, external_data, version) + else: + version = self._get_latest_version(dependency_key, external_data) + self.used_dependencies[dependency_key] = version + return self._get_version_path(dependency_key, version) + + def versions_used(self) -> Mapping[str, str]: + return self.used_dependencies + + def update_all_dependencies(self): + latest_versions = {} + for dependency_key, external_data in self.dependencies.items(): + latest_versions[dependency_key] = self._store_dependency( + dependency_key, external_data + ) + return latest_versions + + def _ensure_required_version_exists( + self, dependency_key: str, external_data: ExternalData, version: str + ) -> None: + version_path = self._get_version_path(dependency_key, version) + if os.path.exists(version_path): + return + # See if downloading from the source creates that version. + stored_version = self._store_dependency(dependency_key, external_data) + if stored_version != version: + raise RuntimeError( + f"Could not retrieve version {version} for dependency {dependency_key}. Source currently returns version {stored_version}." + ) + + def _get_latest_version(self, dependency_key, external_data) -> str: + """Use the latest cached version. If none cached, download from source.""" + version = self._find_latest_cached_version(dependency_key) + if version is not None: + return version + return self._store_dependency(dependency_key, external_data) + + def _get_version_path(self, dependency_key: str, version: str) -> str: + # TODO Here or earlier, ensure dependency_key has no filesystem characters (e.g. '/'). + return os.path.join(self.data_dir, dependency_key, version) + + def _find_latest_cached_version(self, dependency_key: str) -> Optional[str]: + # TODO Here or earlier, ensure dependency_key has no filesystem characters (e.g. '/'). 
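+        # Scan this dependency's directory for "<version>.metadata" files and
+        # return the version whose metadata has the newest creation time.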
+ metadata_files = glob.glob( + os.path.join(self.data_dir, dependency_key, "*.metadata") + ) + version_creation: Dict[str, int] = {} + for filename in metadata_files: + with open(filename, "r") as f: + metadata = from_json(DependencyVersionMetadata, f.read()) + version_creation[metadata.version] = metadata.creation_time_millis + if not version_creation: + return None + # Returns the key with the max value + return max( + version_creation.keys(), key=lambda dict_key: version_creation[dict_key] + ) + + def _store_dependency(self, dependency_key, external_data: ExternalData) -> str: + with tempfile.TemporaryDirectory() as tmpdirname: + tmp_location = os.path.join(tmpdirname, dependency_key) + external_data.download(tmp_location) + version = hash_file(tmp_location) + final_path = self._get_version_path(dependency_key, version) + if os.path.exists(final_path): + # TODO Allow for overwriting + return version + os.makedirs(os.path.join(self.data_dir, dependency_key), exist_ok=True) + os.rename(tmp_location, final_path) + # os.makedirs(final_path, exist_ok=True) + # shutil.move(tmp_location, final_path) + # TODO all the fancy unpacking in HELM's ensure_file_download. + metadata_file = final_path + ".metadata" + with open(metadata_file, "w") as f: + f.write(to_json(DependencyVersionMetadata(version))) + return version diff --git a/newhelm/dependency_helper_test.py b/newhelm/dependency_helper_test.py new file mode 100644 index 00000000..e706220f --- /dev/null +++ b/newhelm/dependency_helper_test.py @@ -0,0 +1,176 @@ +import time + +import pytest +from newhelm.dependency_helper import ( + DependencyVersionMetadata, + FromSourceDependencyHelper, +) +from newhelm.external_data import ExternalData +from newhelm.general import from_json + + +class MockExternalData(ExternalData): + def __init__(self, text): + self.download_calls = 0 + self.text = text + + def download(self, location): + self.download_calls += 1 + with open(location, "w") as f: + f.write(self.text) + + +def test_from_source_single_read(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + "d2": MockExternalData("data-2"), + } + # This is the hash for "data-1" + d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={} + ) + + # Get the d1 dependency + d1_path = helper.get_local_path("d1") + + assert d1_path.endswith(f"d1/{d1_hash}") + assert helper.versions_used() == {"d1": d1_hash} + assert dependencies["d1"].download_calls == 1 + assert dependencies["d2"].download_calls == 0 + + # Ensure the file contains the expected data. 
+ with open(d1_path, "r") as f: + d1_from_file = f.read() + assert d1_from_file == "data-1" + + # Ensure the .metadata file was written + with open(d1_path + ".metadata", "r") as f: + metadata = from_json(DependencyVersionMetadata, f.read()) + assert metadata.version == d1_hash + assert metadata.creation_time_millis > 0 + + +def test_from_source_required_version_already_exists(tmpdir): + # Make the old version + old_dependencies = { + "d1": MockExternalData("data-1"), + } + # This is the hash for "data-1" + d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" + old_helper = FromSourceDependencyHelper( + tmpdir.strpath, old_dependencies, required_versions={} + ) + + # Get the d1 dependency + old_d1_path = old_helper.get_local_path("d1") + + new_dependencies = { + "d1": MockExternalData("updated-data-1"), + } + new_helper = FromSourceDependencyHelper( + tmpdir.strpath, new_dependencies, required_versions={"d1": d1_hash} + ) + new_d1_path = new_helper.get_local_path("d1") + assert old_d1_path == new_d1_path + # Read it from storage. + assert new_dependencies["d1"].download_calls == 0 + + +def test_from_source_required_version_is_live(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + } + # This is the hash for "data-1" + d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={"d1": d1_hash} + ) + + # Get the d1 dependency + d1_path = helper.get_local_path("d1") + assert d1_path.endswith(d1_hash) + + +def test_from_source_required_version_unavailable(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + } + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={"d1": "not-real"} + ) + + with pytest.raises(RuntimeError, match="version not-real for dependency d1"): + helper.get_local_path("d1") + + +def test_from_source_require_older_version(tmpdir): + # First write a version of 'd1' with contents of 'data-1'. + old_dependencies = { + "d1": MockExternalData("data-1"), + } + # This is the hash for "data-1" + d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" + old_helper = FromSourceDependencyHelper( + tmpdir.strpath, old_dependencies, required_versions={} + ) + old_d1_path = old_helper.get_local_path("d1") + time.sleep(0.05) # Ensure timestamp of old is actually older. + + # Now write a newer version of d1 + new_dependencies = { + "d1": MockExternalData("updated-data-1"), + } + new_helper = FromSourceDependencyHelper( + tmpdir.strpath, new_dependencies, required_versions={} + ) + # Force reading the new data. + new_helper.update_all_dependencies() + new_d1_path = new_helper.get_local_path("d1") + assert old_d1_path != new_d1_path + + # Finally, set up a helper with a required version. 
+ required_version_helper = FromSourceDependencyHelper( + tmpdir.strpath, new_dependencies, required_versions={"d1": d1_hash} + ) + required_version_d1_path = required_version_helper.get_local_path("d1") + assert new_d1_path != required_version_d1_path + with open(new_d1_path, "r") as f: + assert f.read() == "updated-data-1" + with open(required_version_d1_path, "r") as f: + assert f.read() == "data-1" + + +def test_from_source_reads_cached(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + "d2": MockExternalData("data-2"), + } + + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={} + ) + d1_path = helper.get_local_path("d1") + d1_path_again = helper.get_local_path("d1") + assert d1_path == d1_path_again + assert dependencies["d1"].download_calls == 1 + + +def test_from_source_update_all(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + "d2": MockExternalData("data-2"), + } + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={} + ) + versions = helper.update_all_dependencies() + assert versions == { + "d1": "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c", + "d2": "00c2022f72beeabc82c8f02099df7abebe43292bac3f44bf63f5827a8c50255a", + } + assert dependencies["d1"].download_calls == 1 + assert dependencies["d2"].download_calls == 1 + # Nothing has actually been read. + assert helper.versions_used() == {} + # TODO read files? diff --git a/newhelm/external_data.py b/newhelm/external_data.py new file mode 100644 index 00000000..7f603a31 --- /dev/null +++ b/newhelm/external_data.py @@ -0,0 +1,17 @@ +from abc import ABC, abstractmethod + +from newhelm.general import shell + + +class ExternalData(ABC): + @abstractmethod + def download(self, location): + pass + + +class WebData(ExternalData): + def __init__(self, source_url): + self.source_url = source_url + + def download(self, location): + shell(["wget", self.source_url, "-O", location]) diff --git a/newhelm/general.py b/newhelm/general.py index 51ec757a..537946cc 100644 --- a/newhelm/general.py +++ b/newhelm/general.py @@ -1,4 +1,5 @@ from dataclasses import asdict, is_dataclass +import hashlib import json import shlex import subprocess @@ -43,3 +44,15 @@ def shell(args: List[str]): exit_code = subprocess.call(args) if exit_code != 0: print(f"Failed with exit code {exit_code}: {cmd}") + + +def hash_file(filename, block_size=65536): + file_hash = hashlib.sha256() + with open(filename, "rb") as f: + while True: + block = f.read(block_size) + if not block: + break + file_hash.update(block) + + return file_hash.hexdigest() From ba92f2cfab8c58d323e24f49f0af1e832984da23 Mon Sep 17 00:00:00 2001 From: Brian Goldman Date: Mon, 4 Dec 2023 09:06:03 -0700 Subject: [PATCH 2/2] Clean up test file and add a few missing doc strings. 
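
As a reference while touching these tests, here is a minimal sketch of
how a consumer is expected to wire up the helper. The data directory,
dependency key, and URL below are illustrative only, not part of this
change:

    from newhelm.dependency_helper import FromSourceDependencyHelper
    from newhelm.external_data import WebData

    helper = FromSourceDependencyHelper(
        "run_data",
        dependencies={"questions": WebData("https://example.com/questions.csv")},
        required_versions={},  # pin {"questions": "<sha256>"} to reproduce a run
    )
    path = helper.get_local_path("questions")  # downloads on first use
    versions = helper.versions_used()  # {"questions": "<sha256 of the file>"}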
--- newhelm/dependency_helper.py | 4 +- newhelm/dependency_helper_test.py | 121 +++++++++++++++++++----------- newhelm/external_data.py | 4 + newhelm/general.py | 1 + 4 files changed, 84 insertions(+), 46 deletions(-) diff --git a/newhelm/dependency_helper.py b/newhelm/dependency_helper.py index 1c1c5121..869a9e16 100644 --- a/newhelm/dependency_helper.py +++ b/newhelm/dependency_helper.py @@ -136,10 +136,8 @@ def _store_dependency(self, dependency_key, external_data: ExternalData) -> str: # TODO Allow for overwriting return version os.makedirs(os.path.join(self.data_dir, dependency_key), exist_ok=True) - os.rename(tmp_location, final_path) - # os.makedirs(final_path, exist_ok=True) - # shutil.move(tmp_location, final_path) # TODO all the fancy unpacking in HELM's ensure_file_download. + os.rename(tmp_location, final_path) metadata_file = final_path + ".metadata" with open(metadata_file, "w") as f: f.write(to_json(DependencyVersionMetadata(version))) diff --git a/newhelm/dependency_helper_test.py b/newhelm/dependency_helper_test.py index e706220f..ea6d5f72 100644 --- a/newhelm/dependency_helper_test.py +++ b/newhelm/dependency_helper_test.py @@ -10,6 +10,8 @@ class MockExternalData(ExternalData): + """Fully in memory ExternalData that counts download calls.""" + def __init__(self, text): self.download_calls = 0 self.text = text @@ -20,13 +22,17 @@ def download(self, location): f.write(self.text) +# This is the sha256 of a file containing "data-1". +_DATA_1_HASH = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" +# This is the sha256 of a file containing "data-2". +_DATA_2_HASH = "00c2022f72beeabc82c8f02099df7abebe43292bac3f44bf63f5827a8c50255a" + + def test_from_source_single_read(tmpdir): dependencies = { "d1": MockExternalData("data-1"), "d2": MockExternalData("data-2"), } - # This is the hash for "data-1" - d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" helper = FromSourceDependencyHelper( tmpdir.strpath, dependencies, required_versions={} ) @@ -34,8 +40,8 @@ def test_from_source_single_read(tmpdir): # Get the d1 dependency d1_path = helper.get_local_path("d1") - assert d1_path.endswith(f"d1/{d1_hash}") - assert helper.versions_used() == {"d1": d1_hash} + assert d1_path.endswith(f"d1/{_DATA_1_HASH}") + assert helper.versions_used() == {"d1": _DATA_1_HASH} assert dependencies["d1"].download_calls == 1 assert dependencies["d2"].download_calls == 0 @@ -47,17 +53,49 @@ def test_from_source_single_read(tmpdir): # Ensure the .metadata file was written with open(d1_path + ".metadata", "r") as f: metadata = from_json(DependencyVersionMetadata, f.read()) - assert metadata.version == d1_hash + assert metadata.version == _DATA_1_HASH assert metadata.creation_time_millis > 0 +def test_from_source_reads_cached(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + "d2": MockExternalData("data-2"), + } + + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={} + ) + d1_path = helper.get_local_path("d1") + d1_path_again = helper.get_local_path("d1") + assert d1_path == d1_path_again + assert dependencies["d1"].download_calls == 1 + + +def test_from_source_update_all(tmpdir): + dependencies = { + "d1": MockExternalData("data-1"), + "d2": MockExternalData("data-2"), + } + helper = FromSourceDependencyHelper( + tmpdir.strpath, dependencies, required_versions={} + ) + versions = helper.update_all_dependencies() + assert versions == { + "d1": _DATA_1_HASH, + "d2": _DATA_2_HASH, + } + assert 
dependencies["d1"].download_calls == 1 + assert dependencies["d2"].download_calls == 1 + # Nothing has actually been read. + assert helper.versions_used() == {} + + def test_from_source_required_version_already_exists(tmpdir): # Make the old version old_dependencies = { "d1": MockExternalData("data-1"), } - # This is the hash for "data-1" - d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" old_helper = FromSourceDependencyHelper( tmpdir.strpath, old_dependencies, required_versions={} ) @@ -69,11 +107,11 @@ def test_from_source_required_version_already_exists(tmpdir): "d1": MockExternalData("updated-data-1"), } new_helper = FromSourceDependencyHelper( - tmpdir.strpath, new_dependencies, required_versions={"d1": d1_hash} + tmpdir.strpath, new_dependencies, required_versions={"d1": _DATA_1_HASH} ) new_d1_path = new_helper.get_local_path("d1") assert old_d1_path == new_d1_path - # Read it from storage. + # Ensure it was read from storage. assert new_dependencies["d1"].download_calls == 0 @@ -81,15 +119,14 @@ def test_from_source_required_version_is_live(tmpdir): dependencies = { "d1": MockExternalData("data-1"), } - # This is the hash for "data-1" - d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" helper = FromSourceDependencyHelper( - tmpdir.strpath, dependencies, required_versions={"d1": d1_hash} + tmpdir.strpath, dependencies, required_versions={"d1": _DATA_1_HASH} ) # Get the d1 dependency d1_path = helper.get_local_path("d1") - assert d1_path.endswith(d1_hash) + assert d1_path.endswith(_DATA_1_HASH) + assert dependencies["d1"].download_calls == 1 def test_from_source_required_version_unavailable(tmpdir): @@ -109,8 +146,6 @@ def test_from_source_require_older_version(tmpdir): old_dependencies = { "d1": MockExternalData("data-1"), } - # This is the hash for "data-1" - d1_hash = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c" old_helper = FromSourceDependencyHelper( tmpdir.strpath, old_dependencies, required_versions={} ) @@ -131,7 +166,7 @@ def test_from_source_require_older_version(tmpdir): # Finally, set up a helper with a required version. required_version_helper = FromSourceDependencyHelper( - tmpdir.strpath, new_dependencies, required_versions={"d1": d1_hash} + tmpdir.strpath, new_dependencies, required_versions={"d1": _DATA_1_HASH} ) required_version_d1_path = required_version_helper.get_local_path("d1") assert new_d1_path != required_version_d1_path @@ -141,36 +176,36 @@ def test_from_source_require_older_version(tmpdir): assert f.read() == "data-1" -def test_from_source_reads_cached(tmpdir): - dependencies = { +def test_from_source_use_newest_version(tmpdir): + # First write a version of 'd1' with contents of 'data-1'. + old_dependencies = { "d1": MockExternalData("data-1"), - "d2": MockExternalData("data-2"), } - - helper = FromSourceDependencyHelper( - tmpdir.strpath, dependencies, required_versions={} + old_helper = FromSourceDependencyHelper( + tmpdir.strpath, old_dependencies, required_versions={} ) - d1_path = helper.get_local_path("d1") - d1_path_again = helper.get_local_path("d1") - assert d1_path == d1_path_again - assert dependencies["d1"].download_calls == 1 - + old_d1_path = old_helper.get_local_path("d1") + time.sleep(0.05) # Ensure timestamp of old is actually older. 
-def test_from_source_update_all(tmpdir): - dependencies = { - "d1": MockExternalData("data-1"), - "d2": MockExternalData("data-2"), + # Now write a newer version of d1 + new_dependencies = { + "d1": MockExternalData("updated-data-1"), } - helper = FromSourceDependencyHelper( - tmpdir.strpath, dependencies, required_versions={} + new_helper = FromSourceDependencyHelper( + tmpdir.strpath, new_dependencies, required_versions={} ) - versions = helper.update_all_dependencies() - assert versions == { - "d1": "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c", - "d2": "00c2022f72beeabc82c8f02099df7abebe43292bac3f44bf63f5827a8c50255a", - } - assert dependencies["d1"].download_calls == 1 - assert dependencies["d2"].download_calls == 1 - # Nothing has actually been read. - assert helper.versions_used() == {} - # TODO read files? + # Force reading the new data. + new_helper.update_all_dependencies() + new_d1_path = new_helper.get_local_path("d1") + assert old_d1_path != new_d1_path + + # Finally, set up a helper with no required version + latest_version_helper = FromSourceDependencyHelper( + tmpdir.strpath, new_dependencies, required_versions={} + ) + latest_version_d1_path = latest_version_helper.get_local_path("d1") + assert old_d1_path != latest_version_d1_path + with open(old_d1_path, "r") as f: + assert f.read() == "data-1" + with open(latest_version_d1_path, "r") as f: + assert f.read() == "updated-data-1" diff --git a/newhelm/external_data.py b/newhelm/external_data.py index 7f603a31..669ee71e 100644 --- a/newhelm/external_data.py +++ b/newhelm/external_data.py @@ -4,12 +4,16 @@ class ExternalData(ABC): + """Base class for defining a source of external data.""" + @abstractmethod def download(self, location): pass class WebData(ExternalData): + """External data that can be trivially downloaded using wget.""" + def __init__(self, source_url): self.source_url = source_url diff --git a/newhelm/general.py b/newhelm/general.py index 537946cc..28ec9e49 100644 --- a/newhelm/general.py +++ b/newhelm/general.py @@ -47,6 +47,7 @@ def shell(args: List[str]): def hash_file(filename, block_size=65536): + """Apply sha256 to the bytes of `filename`.""" file_hash = hashlib.sha256() with open(filename, "rb") as f: while True: