Skip to content
This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Define a tool for versioning external data dependencies. #9

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 144 additions & 0 deletions newhelm/dependency_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
import glob
import os
import shutil
import tempfile
from typing import Dict, Mapping, Optional

from newhelm.external_data import ExternalData
from newhelm.general import current_timestamp_millis, from_json, hash_file, to_json


class DependencyHelper(ABC):
    """Interface for accessing versioned external data dependencies."""

    @abstractmethod
    def get_local_path(self, dependency_key: str) -> str:
        """Return a path to the dependency, downloading as needed."""

    @abstractmethod
    def versions_used(self) -> Mapping[str, str]:
        """Report the version of all dependencies accessed during this run."""

    @abstractmethod
    def update_all_dependencies(self) -> Mapping[str, str]:
        """Ensure the local system has the latest version of all dependencies."""


@dataclass(frozen=True)
class DependencyVersionMetadata:
    """Data object we can store alongside a dependency version."""

    # Version identifier for the stored data (the content hash produced when storing).
    version: str
    # When this version was first stored locally, in epoch milliseconds.
    creation_time_millis: int = field(default_factory=current_timestamp_millis)


class FromSourceDependencyHelper(DependencyHelper):
    """When a dependency isn't available locally, download from the primary source.

    When used, the local directory structure will look like this:
    data_dir/
        dependency_1/
           version_x.metadata
           version_x/
              <dependency's data>
           version_y.metadata
           version_y/
              <dependency's data>
        ...
        dependency_2/
        ...
    ...
    """

    def __init__(
        self,
        data_dir,
        dependencies: Mapping[str, ExternalData],
        required_versions: Mapping[str, str],
    ):
        # Root directory under which all dependency versions are cached.
        self.data_dir = data_dir
        # Key -> how to download that dependency from its source.
        self.dependencies = dependencies
        # Keys pinned to a specific version; unpinned keys use the latest cached version.
        self.required_versions = required_versions
        # Versions actually read via get_local_path during this run.
        self.used_dependencies: Dict[str, str] = {}

    def get_local_path(self, dependency_key: str) -> str:
        """Return a local path for the dependency, downloading it if needed.

        Raises:
            KeyError: if dependency_key was not declared in `dependencies`.
            RuntimeError: if a pinned version can't be obtained from the source.
        """
        if dependency_key not in self.dependencies:
            # Raise instead of `assert` so the check survives `python -O`,
            # and matches the KeyError the dict lookup would produce anyway.
            raise KeyError(f"Unknown dependency: {dependency_key}")
        external_data: ExternalData = self.dependencies[dependency_key]

        version: str
        if dependency_key in self.required_versions:
            version = self.required_versions[dependency_key]
            self._ensure_required_version_exists(dependency_key, external_data, version)
        else:
            version = self._get_latest_version(dependency_key, external_data)
        self.used_dependencies[dependency_key] = version
        return self._get_version_path(dependency_key, version)

    def versions_used(self) -> Mapping[str, str]:
        """Report the versions returned by get_local_path so far."""
        return self.used_dependencies

    def update_all_dependencies(self) -> Mapping[str, str]:
        """Download the current source version of every dependency.

        Returns a mapping of dependency key to the version just stored.
        Does not mark anything as "used".
        """
        latest_versions = {}
        for dependency_key, external_data in self.dependencies.items():
            latest_versions[dependency_key] = self._store_dependency(
                dependency_key, external_data
            )
        return latest_versions

    def _ensure_required_version_exists(
        self, dependency_key: str, external_data: ExternalData, version: str
    ) -> None:
        """Ensure `version` is cached locally, raising if the source can't provide it."""
        version_path = self._get_version_path(dependency_key, version)
        if os.path.exists(version_path):
            return
        # See if downloading from the source creates that version.
        stored_version = self._store_dependency(dependency_key, external_data)
        if stored_version != version:
            raise RuntimeError(
                f"Could not retrieve version {version} for dependency {dependency_key}. Source currently returns version {stored_version}."
            )

    def _get_latest_version(self, dependency_key: str, external_data: ExternalData) -> str:
        """Use the latest cached version. If none cached, download from source."""
        version = self._find_latest_cached_version(dependency_key)
        if version is not None:
            return version
        return self._store_dependency(dependency_key, external_data)

    def _get_version_path(self, dependency_key: str, version: str) -> str:
        """Compute where this dependency version lives (or would live) on disk."""
        # TODO Here or earlier, ensure dependency_key has no filesystem characters (e.g. '/').
        return os.path.join(self.data_dir, dependency_key, version)

    def _find_latest_cached_version(self, dependency_key: str) -> Optional[str]:
        """Return the cached version with the newest creation timestamp, or None if none cached."""
        # TODO Here or earlier, ensure dependency_key has no filesystem characters (e.g. '/').
        metadata_files = glob.glob(
            os.path.join(self.data_dir, dependency_key, "*.metadata")
        )
        version_creation: Dict[str, int] = {}
        for filename in metadata_files:
            with open(filename, "r") as f:
                metadata = from_json(DependencyVersionMetadata, f.read())
            version_creation[metadata.version] = metadata.creation_time_millis
        if not version_creation:
            return None
        # Returns the key with the max value
        return max(
            version_creation.keys(), key=lambda dict_key: version_creation[dict_key]
        )

    def _store_dependency(self, dependency_key: str, external_data: ExternalData) -> str:
        """Download the dependency, store it keyed by its content hash, and return that version."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            tmp_location = os.path.join(tmpdirname, dependency_key)
            external_data.download(tmp_location)
            # The version is the content hash, so identical data maps to a single version.
            version = hash_file(tmp_location)
            final_path = self._get_version_path(dependency_key, version)
            if os.path.exists(final_path):
                # TODO Allow for overwriting
                return version
            os.makedirs(os.path.join(self.data_dir, dependency_key), exist_ok=True)
            # TODO all the fancy unpacking in HELM's ensure_file_download.
            # Use shutil.move rather than os.rename: os.rename raises OSError (EXDEV)
            # when the temp directory and data_dir are on different filesystems.
            shutil.move(tmp_location, final_path)
            metadata_file = final_path + ".metadata"
            with open(metadata_file, "w") as f:
                f.write(to_json(DependencyVersionMetadata(version)))
            return version
211 changes: 211 additions & 0 deletions newhelm/dependency_helper_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
import os
import time

import pytest

from newhelm.dependency_helper import (
    DependencyVersionMetadata,
    FromSourceDependencyHelper,
)
from newhelm.external_data import ExternalData
from newhelm.general import from_json


class MockExternalData(ExternalData):
    """In-memory ExternalData stand-in that records how often it is downloaded."""

    def __init__(self, text):
        # Payload written out by every download() call.
        self.text = text
        # Number of times download() has been invoked.
        self.download_calls = 0

    def download(self, location):
        self.download_calls += 1
        with open(location, "w") as handle:
            handle.write(self.text)


# This is the sha256 of a file containing "data-1".
_DATA_1_HASH = "51bbfa74f8660493f40fd72068f63af436ee13c283ca84c373d9690ff2f1f83c"
# This is the sha256 of a file containing "data-2".
_DATA_2_HASH = "00c2022f72beeabc82c8f02099df7abebe43292bac3f44bf63f5827a8c50255a"


def test_from_source_single_read(tmpdir):
    """Reading one dependency downloads it, reports its version, and writes metadata."""
    dependencies = {
        "d1": MockExternalData("data-1"),
        "d2": MockExternalData("data-2"),
    }
    helper = FromSourceDependencyHelper(
        tmpdir.strpath, dependencies, required_versions={}
    )

    # Get the d1 dependency
    d1_path = helper.get_local_path("d1")

    # Use os.path.join so the check also passes on Windows; the previous
    # f"d1/{hash}" suffix hard-coded the "/" separator.
    assert d1_path.endswith(os.path.join("d1", _DATA_1_HASH))
    assert helper.versions_used() == {"d1": _DATA_1_HASH}
    assert dependencies["d1"].download_calls == 1
    # Only the dependency actually read should have been downloaded.
    assert dependencies["d2"].download_calls == 0

    # Ensure the file contains the expected data.
    with open(d1_path, "r") as f:
        d1_from_file = f.read()
    assert d1_from_file == "data-1"

    # Ensure the .metadata file was written
    with open(d1_path + ".metadata", "r") as f:
        metadata = from_json(DependencyVersionMetadata, f.read())
    assert metadata.version == _DATA_1_HASH
    assert metadata.creation_time_millis > 0


def test_from_source_reads_cached(tmpdir):
    """A second read of the same dependency hits the cache, not the source."""
    dependencies = {
        "d1": MockExternalData("data-1"),
        "d2": MockExternalData("data-2"),
    }
    helper = FromSourceDependencyHelper(
        tmpdir.strpath, dependencies, required_versions={}
    )

    first_path = helper.get_local_path("d1")
    second_path = helper.get_local_path("d1")

    # Same path both times, and the source was only contacted once.
    assert first_path == second_path
    assert dependencies["d1"].download_calls == 1


def test_from_source_update_all(tmpdir):
    """update_all_dependencies downloads everything without marking anything used."""
    dependencies = {
        "d1": MockExternalData("data-1"),
        "d2": MockExternalData("data-2"),
    }
    helper = FromSourceDependencyHelper(
        tmpdir.strpath, dependencies, required_versions={}
    )

    versions = helper.update_all_dependencies()

    assert versions == {"d1": _DATA_1_HASH, "d2": _DATA_2_HASH}
    for key in ("d1", "d2"):
        assert dependencies[key].download_calls == 1
    # Nothing has actually been read.
    assert helper.versions_used() == {}


def test_from_source_required_version_already_exists(tmpdir):
    """A required version already on disk is served without contacting the source."""
    # Seed the cache with the old version of d1.
    seed_helper = FromSourceDependencyHelper(
        tmpdir.strpath, {"d1": MockExternalData("data-1")}, required_versions={}
    )
    seeded_path = seed_helper.get_local_path("d1")

    # The source now serves different data, but we pin to the cached version.
    live_data = {"d1": MockExternalData("updated-data-1")}
    pinned_helper = FromSourceDependencyHelper(
        tmpdir.strpath, live_data, required_versions={"d1": _DATA_1_HASH}
    )
    pinned_path = pinned_helper.get_local_path("d1")

    assert pinned_path == seeded_path
    # Ensure it was read from storage.
    assert live_data["d1"].download_calls == 0


def test_from_source_required_version_is_live(tmpdir):
    """A required version missing locally is satisfied by downloading from source."""
    live_data = {"d1": MockExternalData("data-1")}
    helper = FromSourceDependencyHelper(
        tmpdir.strpath, live_data, required_versions={"d1": _DATA_1_HASH}
    )

    d1_path = helper.get_local_path("d1")

    assert d1_path.endswith(_DATA_1_HASH)
    assert live_data["d1"].download_calls == 1


def test_from_source_required_version_unavailable(tmpdir):
    """A required version the source cannot produce raises RuntimeError."""
    helper = FromSourceDependencyHelper(
        tmpdir.strpath,
        {"d1": MockExternalData("data-1")},
        required_versions={"d1": "not-real"},
    )

    with pytest.raises(RuntimeError, match="version not-real for dependency d1"):
        helper.get_local_path("d1")


def test_from_source_require_older_version(tmpdir):
    """Pinning to an older cached version wins over a newer cached version."""
    # First write a version of 'd1' with contents of 'data-1'.
    old_helper = FromSourceDependencyHelper(
        tmpdir.strpath, {"d1": MockExternalData("data-1")}, required_versions={}
    )
    old_d1_path = old_helper.get_local_path("d1")
    time.sleep(0.05)  # Ensure timestamp of old is actually older.

    # Now write a newer version of d1.
    new_dependencies = {"d1": MockExternalData("updated-data-1")}
    new_helper = FromSourceDependencyHelper(
        tmpdir.strpath, new_dependencies, required_versions={}
    )
    new_helper.update_all_dependencies()  # Force reading the new data.
    new_d1_path = new_helper.get_local_path("d1")
    assert old_d1_path != new_d1_path

    # Finally, a helper pinned to the old version should resolve to the old data.
    pinned_helper = FromSourceDependencyHelper(
        tmpdir.strpath, new_dependencies, required_versions={"d1": _DATA_1_HASH}
    )
    pinned_d1_path = pinned_helper.get_local_path("d1")
    assert pinned_d1_path != new_d1_path
    with open(new_d1_path, "r") as f:
        assert f.read() == "updated-data-1"
    with open(pinned_d1_path, "r") as f:
        assert f.read() == "data-1"


def test_from_source_use_newest_version(tmpdir):
    """With no pinned version, the newest cached version is the one used."""
    # First write a version of 'd1' with contents of 'data-1'.
    old_helper = FromSourceDependencyHelper(
        tmpdir.strpath, {"d1": MockExternalData("data-1")}, required_versions={}
    )
    old_d1_path = old_helper.get_local_path("d1")
    time.sleep(0.05)  # Ensure timestamp of old is actually older.

    # Now write a newer version of d1.
    new_dependencies = {"d1": MockExternalData("updated-data-1")}
    new_helper = FromSourceDependencyHelper(
        tmpdir.strpath, new_dependencies, required_versions={}
    )
    new_helper.update_all_dependencies()  # Force reading the new data.
    new_d1_path = new_helper.get_local_path("d1")
    assert old_d1_path != new_d1_path

    # Finally, an unpinned helper should pick up the newest cached version.
    latest_helper = FromSourceDependencyHelper(
        tmpdir.strpath, new_dependencies, required_versions={}
    )
    latest_d1_path = latest_helper.get_local_path("d1")
    assert latest_d1_path != old_d1_path
    with open(old_d1_path, "r") as f:
        assert f.read() == "data-1"
    with open(latest_d1_path, "r") as f:
        assert f.read() == "updated-data-1"
21 changes: 21 additions & 0 deletions newhelm/external_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from abc import ABC, abstractmethod

from newhelm.general import shell


class ExternalData(ABC):
    """Base class for defining a source of external data."""

    @abstractmethod
    def download(self, location):
        """Fetch the data and write it to `location` on the local filesystem."""


class WebData(ExternalData):
    """External data that can be trivially downloaded using wget."""

    def __init__(self, source_url):
        # URL that download() fetches verbatim.
        self.source_url = source_url

    def download(self, location):
        # -O directs wget's output to the requested location.
        command = ["wget", self.source_url, "-O", location]
        shell(command)
Loading