Create patch from .tar instead of .tar.gz (alternative solution) (#105)
* make DEFAULT_HASH_ALGORITHM part of Patcher

* implement Patcher that diffs .tar content instead of .tar.gz (see the sketch below)

* only verify integrity of the final patch result

Verifying individual patches inside the loop does tell us which specific patch failed, but in the end we still mark all patches as .failed, so we'd be doing unnecessary work.

* calculate size and hash when making diff

It would be cleaner, and easier to test, if we separated the diff/patch and hash/verify steps. However, we need to verify right after patching, but *before* writing the dst file.
Thus, the hash verification step was pulled inside the patch method. Now, for consistency, it makes sense to pull the hash creation into the diff method.

* make the size_and_hash methods private

* rewrite patcher tests to match new implementation

* verify is_patch for test data

* introduce MAX_SIZE_RATIO for the patch vs. full-update decision

* adapt client tests to new patcher implementation and update test data to match

* undo parenthesized context managers (only needed to support python 3.8)

* use tarfile instead of shutil in make_gztar_archive, for consistency (note that shutil uses tarfile under the hood anyway); see the usage sketch below
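
For context, the change boils down to the following minimal sketch (not the actual tufup code: bsdiff4, gzip, the sha256 default, and the 0.8 ratio appear in the diff below, while the function names and the expected dict used here are purely illustrative):

import gzip
import hashlib
import pathlib

import bsdiff4

MAX_SIZE_RATIO = 0.8  # above this patch-size/full-size ratio, the client falls back to a full update


def make_patch(src_targz: pathlib.Path, dst_targz: pathlib.Path, patch_path: pathlib.Path) -> dict:
    # diff the decompressed .tar content, so small changes do not blow up the patch size
    with gzip.open(src_targz, mode='rb') as src_file:
        src_tar = src_file.read()
    with gzip.open(dst_targz, mode='rb') as dst_file:
        dst_tar = dst_file.read()
    patch_path.write_bytes(bsdiff4.diff(src_bytes=src_tar, dst_bytes=dst_tar))
    # size and hash of the new .tar travel with the patch as custom metadata,
    # so the client can verify the reconstructed archive before writing it
    return dict(tar_size=len(dst_tar), tar_hash=hashlib.sha256(dst_tar).hexdigest())


def apply_patches(current_targz: pathlib.Path, new_targz: pathlib.Path, patch_paths: list, expected: dict) -> None:
    # apply cumulative patches (sorted by increasing version) to the decompressed .tar
    with gzip.open(current_targz, mode='rb') as src_file:
        tar_bytes = src_file.read()
    for patch_path in patch_paths:
        tar_bytes = bsdiff4.patch(src_bytes=tar_bytes, patch_bytes=patch_path.read_bytes())
    # verify only the final result, and only then write the new .tar.gz
    if len(tar_bytes) != expected['tar_size'] or hashlib.sha256(tar_bytes).hexdigest() != expected['tar_hash']:
        raise Exception('patched archive failed size/hash verification')
    with gzip.open(new_targz, mode='wb') as dst_file:
        dst_file.write(tar_bytes)

On the client side, patches are only used if their combined download size stays below MAX_SIZE_RATIO times the size of the full archive; otherwise a full update is simply cheaper.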
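
For the last bullet, a hedged usage sketch of the reworked make_gztar_archive (the signature and the PAX default are taken from the diff below; the paths, app name, and version are assumptions, and USTAR_FORMAT is just one possible alternative format):

import tarfile

from tufup.repo import make_gztar_archive

# opt into the older ustar format for maximum portability and reproducibility
target_meta = make_gztar_archive(
    src_dir='dist/my_app',          # assumed bundle directory
    dst_dir='repository/targets',   # assumed targets directory
    app_name='my_app',
    version='2.0',
    tar_format=tarfile.USTAR_FORMAT,
)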
dennisvang authored Feb 12, 2024
1 parent 61323f9 commit 9910d05
Showing 15 changed files with 281 additions and 130 deletions.
51 changes: 27 additions & 24 deletions src/tufup/client.py
@@ -7,19 +7,20 @@
from typing import Callable, Dict, Iterator, List, Optional, Tuple, Union
from urllib import parse

import bsdiff4
import requests
from requests.auth import AuthBase
from tuf.api.exceptions import DownloadError, UnsignedMetadataError
import tuf.ngclient

from tufup.common import TargetMeta
from tufup.common import Patcher, TargetMeta
from tufup.utils.platform_specific import install_update

logger = logging.getLogger(__name__)

DEFAULT_EXTRACT_DIR = pathlib.Path(tempfile.gettempdir()) / 'tufup'
SUFFIX_FAILED = '.failed'
# do full update if patch-size/full-size > MAX_SIZE_RATIO
MAX_SIZE_RATIO = 0.8


class Client(tuf.ngclient.Updater):
@@ -209,7 +210,9 @@ def check_for_updates(
# is not available, we must do a full update)
self.new_targets = new_patches
no_patches = total_patch_size == 0
patch_too_big = total_patch_size > self.new_archive_info.length
patch_too_big = (
total_patch_size / self.new_archive_info.length > MAX_SIZE_RATIO
)
no_archive = not self.current_archive_local_path.exists()
if not patch or no_patches or patch_too_big or no_archive or abort_patch:
# fall back on full update
@@ -251,29 +254,29 @@ def _apply_updates(
Note this has a side-effect: if self.extract_dir is not specified,
an extract_dir is created in a platform-specific temporary location.
"""
# patch current archive (if we have patches) or use new full archive
archive_bytes = None
file_path = None
target = None
# either patch the current archive (if we have patches) or use new full archive
try:
for target, file_path in sorted(self.downloaded_target_files.items()):
if target.is_archive:
# just ensure the full archive file is available
assert len(self.downloaded_target_files) == 1, 'too many targets'
assert self.new_archive_local_path.exists(), 'new archive missing'
elif target.is_patch:
# create new archive by patching current archive (patches
# must be sorted by increasing version)
if archive_bytes is None:
archive_bytes = self.current_archive_local_path.read_bytes()
archive_bytes = bsdiff4.patch(archive_bytes, file_path.read_bytes())
if archive_bytes:
# verify the patched archive length and hash
self.new_archive_info.verify_length_and_hashes(data=archive_bytes)
# write the patched new archive
self.new_archive_local_path.write_bytes(archive_bytes)
if next(iter(self.downloaded_target_files.keys())).is_archive:
# full archive is available
if len(self.downloaded_target_files) != 1:
raise ValueError('there should be only one downloaded *archive*')
if not self.new_archive_local_path.exists():
raise FileNotFoundError('the new archive file does not exist')
else:
# reconstruct full archive from patch(es)
if not all(
target.is_patch for target in self.downloaded_target_files.keys()
):
raise ValueError('all downloaded targets must be patches')
Patcher.patch_and_verify(
src_path=self.current_archive_local_path,
dst_path=self.new_archive_local_path,
patch_targets=self.downloaded_target_files,
)
except Exception as e:
if target and file_path and file_path.exists():
# rename all failed targets in order to skip them (patches) or retry
# them (archive) on the next run
for target, file_path in self.downloaded_target_files.items():
renamed_path = file_path.replace(
file_path.with_suffix(file_path.suffix + SUFFIX_FAILED)
)
104 changes: 88 additions & 16 deletions src/tufup/common.py
@@ -1,7 +1,9 @@
import gzip
import hashlib
import logging
import pathlib
import re
from typing import Optional, Union
from typing import Dict, Optional, Union

import bsdiff4
from packaging.version import Version, InvalidVersion
@@ -155,27 +157,97 @@ def compose_filename(cls, name: str, version: str, is_archive: bool):


class Patcher(object):
DEFAULT_HASH_ALGORITHM = 'sha256'

@staticmethod
def _get_tar_size_and_hash(
tar_content: Optional[bytes] = None, algorithm: str = DEFAULT_HASH_ALGORITHM
) -> dict:
"""
Determines the size and hash of the specified data.
Note we could also use tuf.api.metadata.TargetFile for this, but we'll
keep this part independent from python-tuf, for clarity and flexibility.
"""
hash_obj = getattr(hashlib, algorithm)()
hash_obj.update(tar_content)
# hexdigest returns digest as string
return dict(
tar_size=len(tar_content),
tar_hash=hash_obj.hexdigest(),
tar_hash_algorithm=algorithm,
)

@classmethod
def create_patch(
cls, src_path: pathlib.Path, dst_path: pathlib.Path
) -> pathlib.Path:
def _verify_tar_size_and_hash(cls, tar_content: bytes, expected: dict):
"""
Create a binary patch file based on source and destination files.
Verifies that size and hash of data match the expected values.
Patch file path matches destination file path, except for suffix.
Raises an exception if this is not the case.
"""
# replace suffix twice, in case we have a .tar.gz
patch_path = dst_path.with_suffix('').with_suffix(SUFFIX_PATCH)
bsdiff4.file_diff(src_path=src_path, dst_path=dst_path, patch_path=patch_path)
return patch_path
result = cls._get_tar_size_and_hash(
tar_content=tar_content, algorithm=expected['tar_hash_algorithm']
)
for key in ['tar_size', 'tar_hash']:
if result[key] != expected[key]:
raise Exception(f'verification failed: {key} mismatch')

@classmethod
def apply_patch(cls, src_path: pathlib.Path, patch_path: pathlib.Path):
def diff_and_hash(
cls, src_path: pathlib.Path, dst_path: pathlib.Path, patch_path: pathlib.Path
) -> dict:
"""
Apply binary patch file to source file to create destination file.
Creates a patch file from the binary difference between source and destination
.tar archives. The source and destination files are expected to be
gzip-compressed tar archives (.tar.gz).
Destination file path matches patch file path, except for suffix.
Returns a dict with size and hash of the *uncompressed* destination archive.
"""
dst_path = patch_path.with_suffix(SUFFIX_ARCHIVE)
bsdiff4.file_patch(src_path=src_path, dst_path=dst_path, patch_path=patch_path)
return dst_path
with gzip.open(src_path, mode='rb') as src_file:
with gzip.open(dst_path, mode='rb') as dst_file:
dst_tar_content = dst_file.read()
patch_path.write_bytes(
bsdiff4.diff(src_bytes=src_file.read(), dst_bytes=dst_tar_content)
)
return cls._get_tar_size_and_hash(tar_content=dst_tar_content)

@classmethod
def patch_and_verify(
cls,
src_path: pathlib.Path,
dst_path: pathlib.Path,
patch_targets: Dict[TargetMeta, pathlib.Path],
) -> None:
"""
Applies one or more binary patch files to a source file in order to
reconstruct a destination file.
Source file and destination file are gzip-compressed tar archives, but the
patches are applied to the *uncompressed* tar archives. The reason is that
small changes in uncompressed data can cause (very) large differences in
gzip compressed data, leading to excessively large patch files (see #69).
The integrity of the patched .tar archive is verified using expected length
and hash (from custom tuf metadata), similar to python-tuf's download
verification. If the patched archive fails this check, the destination file
is not written.
"""
if not patch_targets:
raise ValueError('no patch targets')
# decompress .tar data from source .tar.gz file
with gzip.open(src_path, mode='rb') as src_file:
tar_bytes = src_file.read()
# apply cumulative patches (sorted by version, in ascending order)
for patch_meta, patch_path in sorted(patch_targets.items()):
logger.info(f'applying patch: {patch_meta.name}')
tar_bytes = bsdiff4.patch(
src_bytes=tar_bytes, patch_bytes=patch_path.read_bytes()
)
# verify integrity of the final result (raises exception on failure)
cls._verify_tar_size_and_hash(
tar_content=tar_bytes,
expected=patch_meta.custom, # noqa
)
# compress .tar data into destination .tar.gz file
with gzip.open(dst_path, mode='wb') as dst_file:
dst_file.write(tar_bytes)
50 changes: 31 additions & 19 deletions src/tufup/repo/__init__.py
@@ -11,6 +11,7 @@
except AssertionError:
pass # assuming we are on the client side...
import shutil
import tarfile
from typing import Any, Dict, Iterable, List, Optional, TypedDict, Union

from securesystemslib.exceptions import CryptoError
@@ -37,7 +38,7 @@
)
from tuf.api.serialization.json import JSONSerializer

from tufup.common import Patcher, SUFFIX_ARCHIVE, SUFFIX_PATCH, TargetMeta
from tufup.common import Patcher, SUFFIX_PATCH, TargetMeta
from tufup.utils.platform_specific import _patched_resolve

logger = logging.getLogger(__name__)
@@ -79,12 +80,19 @@ def make_gztar_archive(
dst_dir: Union[pathlib.Path, str],
app_name: str,
version: str,
**kwargs, # allowed kwargs are passed on to shutil.make_archive
tar_format: int = tarfile.PAX_FORMAT,
) -> Optional[TargetMeta]:
# remove disallowed kwargs
for key in ['base_name', 'root_dir', 'format']:
if kwargs.pop(key, None):
logger.warning(f'{key} ignored: using default')
"""
Create a gzipped tar archive in the dst_dir, based on content of src_dir.
The PAX_FORMAT is currently the default tar format [1] used by the tarfile
module. For improved portability [2] and reproducibility [3], this can be changed
e.g. to USTAR_FORMAT.
[1]: https://www.gnu.org/software/tar/manual/html_node/Formats.html#Formats
[2]: https://www.gnu.org/software/tar/manual/html_node/Portability.html#Portability
[3]: https://www.gnu.org/software/tar/manual/html_node/Reproducibility.html#Reproducibility
"""
# ensure paths
src_dir = pathlib.Path(src_dir)
dst_dir = pathlib.Path(dst_dir)
@@ -97,15 +105,11 @@ def make_gztar_archive(
if input(f'Found existing archive: {archive_path}.\nOverwrite? [n]/y') != 'y':
print('Using existing archive.')
return TargetMeta(archive_path)
# make archive
base_name = str(dst_dir / archive_filename.replace(SUFFIX_ARCHIVE, ''))
archive_path_str = shutil.make_archive(
base_name=base_name, # archive file path, no suffix
root_dir=str(src_dir), # paths in archive will be relative to root_dir
format='gztar',
**kwargs,
)
return TargetMeta(target_path=archive_path_str)
# make gzipped tar archive
with tarfile.open(archive_path, mode='w:gz', format=tar_format) as tar:
# filter could be used in future versions to modify the tarinfo objects
tar.add(name=src_dir, arcname='.', recursive=True, filter=None)
return TargetMeta(target_path=archive_path)


class RolesDict(TypedDict):
@@ -383,6 +387,7 @@ def add_or_update_target(
target_file_path=url_path, local_path=str(local_path)
)
if custom:
# todo: handle creation of patch metadata here?
# todo: should we verify that custom is a dict?
target_file_info.unrecognized_fields['custom'] = custom
# note we assume self.targets has been initialized
@@ -765,11 +770,18 @@ def add_bundle(
)
# create patch, if possible, and register that too
if latest_archive and not skip_patch:
patch_path = Patcher.create_patch(
src_path=self.targets_dir / latest_archive.path,
dst_path=self.targets_dir / new_archive.path,
src_path = self.targets_dir / latest_archive.path
dst_path = self.targets_dir / new_archive.path
patch_path = dst_path.with_suffix('').with_suffix(SUFFIX_PATCH)
# create patch
dst_size_and_hash = Patcher.diff_and_hash(
src_path=src_path, dst_path=dst_path, patch_path=patch_path
)
# register patch (size and hash are used by the client to verify the
# integrity of the patched archive)
self.roles.add_or_update_target(
local_path=patch_path, custom=dst_size_and_hash
)
self.roles.add_or_update_target(local_path=patch_path)

def remove_latest_bundle(self):
"""
6 changes: 3 additions & 3 deletions tests/data/repository/metadata/1.root.json
@@ -2,17 +2,17 @@
"signatures": [
{
"keyid": "b7ad916e4138911155b771d0ede66666e9647e7fb6c85a1904be97dee5653568",
"sig": "8582f12a66a923c8069a4385ef594c345ca2bd69741c0ba2691c4cb20e005e7a771f6ca651852d1264d13107d108c5843d3f9b69bcd20500f7108cca6e6c8901"
"sig": "0f634a6e5f82af4447accce63c2987350c9c16fe6f8ce391ed504da106be8a127e1d606424c97a27822038cfd35e4daa96da2ec07a4a75bc2610df3bfc95cd0c"
},
{
"keyid": "d4ec748f9476f9f7e1f0a247b917dde4abe8a024de9ba34c7458b41bec8be6b2",
"sig": "3f2a6d6cd8232d0ca1f2b75445a7dc9bc4342f72fe88204fac7e7acad48eb6102ff1ba4b1efaf8f8ec32ee11cf68a5f92e34300f66b37e5970e878f77b2e9c0b"
"sig": "678256d67bcf6022f75920ff380dc2111e2d68120af834f1769d694665236a2c7fb57ea5731f4050e1562a8b2be870b6594a2203f52182b1b77fa98ae89ed90c"
}
],
"signed": {
"_type": "root",
"consistent_snapshot": false,
"expires": "2051-06-25T13:08:41Z",
"expires": "2051-06-27T21:21:03Z",
"keys": {
"5ef48ab6f5398d2bf17f1f4c4fc0e0440c4aa3734a05ae523561e02e8a99957a": {
"keytype": "ed25519",
12 changes: 6 additions & 6 deletions tests/data/repository/metadata/2.root.json
@@ -1,22 +1,22 @@
{
"signatures": [
{
"keyid": "b7ad916e4138911155b771d0ede66666e9647e7fb6c85a1904be97dee5653568",
"sig": "740d4c6945050abd3abba7023cb5128a4e344e83ae0f52f9c978b7b3582dd21213e72a66dec6cd4206093c634cb973cf3ec0940103e54e6a81c4424322cf2d01"
"keyid": "1bd53d9d6f08f6efba19477880b348906f5f29a67d78cbca8a44aedfad12d003",
"sig": "47a42813ae34829c60539dcceba0d4b9a8a9286beaa8d5f07d3de3050d404426c22bc95b271e7c5e7ee529bc3180f009eb31313fb825f76c3ed9ca2c501bd503"
},
{
"keyid": "1bd53d9d6f08f6efba19477880b348906f5f29a67d78cbca8a44aedfad12d003",
"sig": "58ed242676830567413936feec20c80cd79d03fc31bdad38ffd0ef69e40298dfd8fe15edb7a4fd504a01ee5a7cddd3bfbd169ccd9bd2c6067e452aeee3a18102"
"keyid": "b7ad916e4138911155b771d0ede66666e9647e7fb6c85a1904be97dee5653568",
"sig": "421d85636350a89805abc4561acd3019ecf17246a37e91374a53276b5d56638c83754960c27d038c7d1193bdb33db12faf69b7a19099627c745c569093ee0005"
},
{
"keyid": "d4ec748f9476f9f7e1f0a247b917dde4abe8a024de9ba34c7458b41bec8be6b2",
"sig": "7ea041490934e6637998eb22ab367f1d260b3d0cdde144cc5a776dda7a65c27a6061d1b62986851ecbc49ad04c7a428987b323c1c961f65f8e0143c792deb706"
"sig": "a65dbf32349f1a57dd1dd6fc058c69a98be467f5ad408179da6e3b67abc6f2361415eb70214588d21079a9d0351500808f8c244b69f40b35a41999294461ca00"
}
],
"signed": {
"_type": "root",
"consistent_snapshot": false,
"expires": "2051-06-25T13:08:48Z",
"expires": "2051-06-27T21:21:13Z",
"keys": {
"1bd53d9d6f08f6efba19477880b348906f5f29a67d78cbca8a44aedfad12d003": {
"keytype": "ed25519",
12 changes: 6 additions & 6 deletions tests/data/repository/metadata/root.json
@@ -1,22 +1,22 @@
{
"signatures": [
{
"keyid": "b7ad916e4138911155b771d0ede66666e9647e7fb6c85a1904be97dee5653568",
"sig": "740d4c6945050abd3abba7023cb5128a4e344e83ae0f52f9c978b7b3582dd21213e72a66dec6cd4206093c634cb973cf3ec0940103e54e6a81c4424322cf2d01"
"keyid": "1bd53d9d6f08f6efba19477880b348906f5f29a67d78cbca8a44aedfad12d003",
"sig": "47a42813ae34829c60539dcceba0d4b9a8a9286beaa8d5f07d3de3050d404426c22bc95b271e7c5e7ee529bc3180f009eb31313fb825f76c3ed9ca2c501bd503"
},
{
"keyid": "1bd53d9d6f08f6efba19477880b348906f5f29a67d78cbca8a44aedfad12d003",
"sig": "58ed242676830567413936feec20c80cd79d03fc31bdad38ffd0ef69e40298dfd8fe15edb7a4fd504a01ee5a7cddd3bfbd169ccd9bd2c6067e452aeee3a18102"
"keyid": "b7ad916e4138911155b771d0ede66666e9647e7fb6c85a1904be97dee5653568",
"sig": "421d85636350a89805abc4561acd3019ecf17246a37e91374a53276b5d56638c83754960c27d038c7d1193bdb33db12faf69b7a19099627c745c569093ee0005"
},
{
"keyid": "d4ec748f9476f9f7e1f0a247b917dde4abe8a024de9ba34c7458b41bec8be6b2",
"sig": "7ea041490934e6637998eb22ab367f1d260b3d0cdde144cc5a776dda7a65c27a6061d1b62986851ecbc49ad04c7a428987b323c1c961f65f8e0143c792deb706"
"sig": "a65dbf32349f1a57dd1dd6fc058c69a98be467f5ad408179da6e3b67abc6f2361415eb70214588d21079a9d0351500808f8c244b69f40b35a41999294461ca00"
}
],
"signed": {
"_type": "root",
"consistent_snapshot": false,
"expires": "2051-06-25T13:08:48Z",
"expires": "2051-06-27T21:21:13Z",
"keys": {
"1bd53d9d6f08f6efba19477880b348906f5f29a67d78cbca8a44aedfad12d003": {
"keytype": "ed25519",
4 changes: 2 additions & 2 deletions tests/data/repository/metadata/snapshot.json
@@ -2,12 +2,12 @@
"signatures": [
{
"keyid": "5ef48ab6f5398d2bf17f1f4c4fc0e0440c4aa3734a05ae523561e02e8a99957a",
"sig": "29c6c8a45e7c0940e51cac1b9052304bb0baec1e1df35885522846ae5abd039c1846c453cd599ccc36e11c4f0a52de6b772d71627886e22dc77822b4404af602"
"sig": "73a146f5e1f12c0a36e88c8d7bf613baa1d528ea0c9480fe0d2ccd74d6da239da04470f68d283738194185cc82289c5f9f1312efea373b51dc8722965ca1fc0b"
}
],
"signed": {
"_type": "snapshot",
"expires": "2051-06-25T13:08:48Z",
"expires": "2051-06-27T21:21:13Z",
"meta": {
"targets.json": {
"version": 6