Skip to content

Commit

Permalink
Move all hash checking to hash_matches (#157)
Browse files Browse the repository at this point in the history
Including the exception raising controlled by new parameter `strict`.
This removes the need for re-calculating the new hash just so we can
include it in the error message. The hash calculation can be heavy
for large files so we don't want to do it twice. Also improve the error
message to include the hash algorithm and suggest what went wrong.
  • Loading branch information
leouieda authored Apr 2, 2020
1 parent 7ffbc4f commit 4f66acf
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 17 deletions.
18 changes: 4 additions & 14 deletions pooch/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,10 @@

import requests
from .utils import (
file_hash,
check_version,
parse_url,
get_logger,
make_local_storage,
hash_algorithm,
hash_matches,
)
from .downloaders import choose_downloader
Expand Down Expand Up @@ -340,11 +338,13 @@ def mydownloader(url, output_file, pooch):

full_path = self.abspath / fname
url = self.get_url(fname)
known_hash = self.registry[fname]

in_storage = full_path.exists()

if not in_storage:
action = "download"
elif not hash_matches(str(full_path), self.registry[fname]):
elif not hash_matches(str(full_path), known_hash):
action = "update"
else:
action = "fetch"
Expand All @@ -370,17 +370,7 @@ def mydownloader(url, output_file, pooch):
tmp.close()
try:
downloader(url, tmp.name, self)
if not hash_matches(tmp.name, self.registry[fname]):
raise ValueError(
"Hash of downloaded file '{}' doesn't match the entry in the"
" registry. Expected '{}' and got '{}'.".format(
fname,
self.registry[fname],
file_hash(
tmp.name, alg=hash_algorithm(self.registry[fname])
),
)
)
hash_matches(tmp.name, known_hash, strict=True)
# Ensure the parent directory exists in case the file is in a
# subdirectory. Otherwise, move will cause an error.
if not os.path.exists(str(full_path.parent)):
Expand Down
26 changes: 26 additions & 0 deletions pooch/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,29 @@ def test_hash_matches():
for alg in ("sha512", "md5"):
known_hash = "{}:p98oh2dl2j2h2p8e9yfho3fi2e9fhd".format(alg)
assert not hash_matches(fname, known_hash)


def test_hash_matches_strict():
"Make sure the hash checking function raises an exception if strict"
fname = os.path.join(DATA_DIR, "tiny-data.txt")
check_tiny_data(fname)
with open(fname, "rb") as fin:
data = fin.read()
# Check if the check passes
hasher = hashlib.new("sha256")
hasher.update(data)
known_hash = "{}".format(hasher.hexdigest())
assert hash_matches(fname, known_hash, strict=True)
for alg in ("sha512", "md5"):
hasher = hashlib.new(alg)
hasher.update(data)
known_hash = "{}:{}".format(alg, hasher.hexdigest())
assert hash_matches(fname, known_hash, strict=True)
# And also if it fails
bad_hash = "p98oh2dl2j2h2p8e9yfho3fi2e9fhd"
with pytest.raises(ValueError):
hash_matches(fname, bad_hash, strict=True)
for alg in ("sha512", "md5"):
bad_hash = "{}:p98oh2dl2j2h2p8e9yfho3fi2e9fhd".format(alg)
with pytest.raises(ValueError):
hash_matches(fname, bad_hash, strict=True)
19 changes: 16 additions & 3 deletions pooch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def hash_algorithm(hash_string):
return algorithm


def hash_matches(fname, known_hash):
def hash_matches(fname, known_hash, strict=False):
"""
Check if the hash of a file matches a known hash.
Expand All @@ -318,12 +318,25 @@ def hash_matches(fname, known_hash):
known_hash : str
The known hash. Optionally, prepend ``alg:`` to the hash to specify the
hashing algorithm. Default is SHA256.
strict : bool
If True, will raise a :class:`ValueError` if the hash does not match
informing the user that the file may be corrupted.
Returns
-------
is_same : bool
True if the hash matches, False otherwise.
"""
new_hash = file_hash(fname, alg=hash_algorithm(known_hash))
return new_hash == known_hash.split(":")[-1]
algorithm = hash_algorithm(known_hash)
new_hash = file_hash(fname, alg=algorithm)
matches = new_hash == known_hash.split(":")[-1]
if strict and not matches:
raise ValueError(
"{} hash of file '{}' does not match the known hash:"
" expected '{}' but got '{}'. "
" The file may be corrupted or the known hash may be outdated.".format(
algorithm.upper(), fname, known_hash, new_hash,
)
)
return matches

0 comments on commit 4f66acf

Please sign in to comment.