Skip to content

Commit

Permalink
[pp:hash] add 'hash' post processor (#6099)
Browse files Browse the repository at this point in the history
  • Loading branch information
mikf committed Aug 31, 2024
1 parent f52cf54 commit ae9b0da
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 0 deletions.
59 changes: 59 additions & 0 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5586,6 +5586,63 @@ Description
See `metadata.event`_ for a list of available events.


hash.chunk-size
---------------
Type
``integer``
Default
``32768``
Description
Number of bytes read per chunk during file hash computation.


hash.event
----------
Type
* ``string``
* ``list`` of ``strings``
Default
``"file"``
Description
The event(s) for which `file hashes <hash.hashes_>`__ are computed.

See `metadata.event`_ for a list of available events.


hash.filename
-------------
Type
* ``bool``
Default
``false``
Description
Rebuild `filenames <extractor.*.filename_>`__ after computing
`hash digests <hash.hashes_>`__ and adding them to the metadata dict.


hash.hashes
-----------
Type
* ``string``
* ``object`` (`field name` -> `hash algorithm`)
Default
``"md5,sha1"``
Example
.. code:: json
"sha256:hash_sha,sha3_512:hash_sha3"
.. code:: json
{
"hash_sha" : "sha256",
"hash_sha3": "sha3_512"
}
Description
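Hash digests to compute.

A ``string`` value is a comma-separated list of
``<hash algorithm>[:<field name>]`` entries;
when ``:<field name>`` is omitted, the algorithm name is used as field name.
An ``object`` value maps each `field name` to its `hash algorithm`.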


metadata.mode
-------------
Type
Expand Down Expand Up @@ -6694,6 +6751,8 @@ Description
| (requires `downloader.*.part`_ = ``true`` and `extractor.*.skip`_ = ``false``)
``exec``
Execute external commands
``hash``
Compute file hash digests
``metadata``
Write metadata to separate files
``mtime``
Expand Down
1 change: 1 addition & 0 deletions gallery_dl/postprocessor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"classify",
"compare",
"exec",
"hash",
"metadata",
"mtime",
"python",
Expand Down
71 changes: 71 additions & 0 deletions gallery_dl/postprocessor/hash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Compute file hash digests"""

from .common import PostProcessor
import hashlib


class HashPP(PostProcessor):
    """Post processor that stores hash digests of a file in its metadata.

    Recognized options:
      chunk-size  number of bytes read per iteration (default: 32768)
      filename    if true, rebuild the path after the digests were added
      hashes      digests to compute, given as
                  - a dict mapping field names to hash algorithm names,
                  - a string of comma-separated "algorithm[:field name]"
                    entries, or
                  - any other truthy value, used as-is as an iterable of
                    (field name, algorithm) pairs
                  (default: md5 and sha1, stored as "md5" and "sha1")
      event       event name(s) for which 'run' gets registered, as a
                  comma-separated string or an iterable (default: "file")
    """

    def __init__(self, job, options):
        PostProcessor.__init__(self, job)

        # A falsy chunk size (0, null) must never reach fp.read():
        # read(0) always returns b"", which would end the read loop in
        # run() immediately and yield digests of empty input.
        self.chunk_size = options.get("chunk-size") or 32768
        self.filename = options.get("filename")

        default = (("md5", "md5"), ("sha1", "sha1"))
        hashes = options.get("hashes")
        if isinstance(hashes, dict):
            self.hashes = list(hashes.items())
        elif isinstance(hashes, str):
            # fall back to the defaults when the string has no usable entry
            self.hashes = self._parse_hashes(hashes) or default
        elif hashes:
            self.hashes = hashes
        else:
            self.hashes = default

        events = options.get("event")
        if events is None:
            events = ("file",)
        elif isinstance(events, str):
            events = events.split(",")
        job.register_hooks({event: self.run for event in events}, options)

    @staticmethod
    def _parse_hashes(spec):
        """Parse an "algorithm[:field name],..." string.

        Returns a list of (field name, algorithm) tuples. Whitespace
        around names is ignored and entries without an algorithm name
        (e.g. caused by a trailing comma) are skipped, since such names
        would only make hashlib.new() raise ValueError for every file.
        An entry without field name uses its algorithm name as field name.
        """
        pairs = []
        for entry in spec.split(","):
            name, _, key = entry.partition(":")
            name = name.strip()
            if not name:
                continue
            pairs.append((key.strip() or name, name))
        return pairs

    def run(self, pathfmt):
        """Hash the file of 'pathfmt' and store the digests in its kwdict.

        Every configured digest is written as hexadecimal string to
        pathfmt.kwdict under its field name. When the 'filename' option
        is enabled, pathfmt.build_path() is called afterwards.

        Raises whatever hashlib.new() raises for an unknown algorithm
        name and OSError if the file cannot be opened.
        """
        hashes = [
            (key, hashlib.new(name))
            for key, name in self.hashes
        ]

        size = self.chunk_size
        with self._open(pathfmt) as fp:
            # single pass over the file; each chunk feeds all hash objects
            while True:
                data = fp.read(size)
                if not data:
                    break
                for _, h in hashes:
                    h.update(data)

        for key, h in hashes:
            pathfmt.kwdict[key] = h.hexdigest()

        if self.filename:
            pathfmt.build_path()

    def _open(self, pathfmt):
        """Open the file of 'pathfmt' for reading in binary mode.

        Tries pathfmt.temppath first and falls back to pathfmt.realpath
        when that raises OSError.
        """
        try:
            return open(pathfmt.temppath, "rb")
        except OSError:
            return open(pathfmt.realpath, "rb")


# Module-level alias for this module's post processor class.
# NOTE(review): presumably the attribute the postprocessor package looks up
# when loading this module by name - confirm against
# gallery_dl/postprocessor/__init__.py
__postprocessor__ = HashPP
51 changes: 51 additions & 0 deletions test/test_postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,57 @@ def test_async(self):
self.assertFalse(i.wait.called)


class HashTest(BasePostprocessorTest):
    """Tests for the 'hash' post processor."""

    def _hash_sample(self, options):
        """Hash a small sample file using 'options'; return its kwdict."""
        self._create(options)

        with self.pathfmt.open() as fp:
            fp.write(b"Foo Bar\n")

        self._trigger()
        return self.pathfmt.kwdict

    def _assert_sha_fields(self, kwdict):
        """Check the sample's sha256/sha512 digests in fields 'a'/'b'."""
        sha256 = (
            "4775b55be17206445d7015a5fc7656f38a74b880670523c3b175455f885f2395"
        )
        sha512 = (
            "6028f9e6957f4ca929941318c4bba6258713fd5162f9e33bd10e1c456d252700"
            "3e1095b50736c4fd1e2deea152e3c8ecd5993462a747208e4d842659935a1c62"
        )
        self.assertEqual(sha256, kwdict["a"], "sha256")
        self.assertEqual(sha512, kwdict["b"], "sha512")

    def test_default(self):
        # without a 'hashes' option, md5 and sha1 digests are computed
        kwdict = self._hash_sample({})

        md5 = "35c9c9c7c90ad764bae9e2623f522c24"
        sha1 = "14d3d804494ef4e57d72de63e4cfee761240471a"
        self.assertEqual(md5, kwdict["md5"], "md5")
        self.assertEqual(sha1, kwdict["sha1"], "sha1")

    def test_custom_hashes(self):
        # string form: "algorithm:field name" entries
        kwdict = self._hash_sample({"hashes": "sha256:a,sha512:b"})
        self._assert_sha_fields(kwdict)

    def test_custom_hashes_dict(self):
        # dict form: field name -> algorithm
        kwdict = self._hash_sample({"hashes": {"a": "sha256", "b": "sha512"}})
        self._assert_sha_fields(kwdict)


class MetadataTest(BasePostprocessorTest):

def test_metadata_default(self):
Expand Down

0 comments on commit ae9b0da

Please sign in to comment.