From 143bb550245aa51414d6a70de09e829287682cfb Mon Sep 17 00:00:00 2001 From: Marek Goldmann Date: Thu, 1 Sep 2016 13:08:29 +0200 Subject: [PATCH] Files added when parent directory is a symlink When we add a file back to the tar we need to check if the parent (or some parent of it) is a symlink. We should not add any files in such case. Fixes #122. --- docker_squash/image.py | 58 ++++++++++++++++++++++++++----------- tests/test_integ_squash.py | 30 +++++++++++++++++-- tests/test_unit_v1_image.py | 28 ++++++++++++++---- 3 files changed, 92 insertions(+), 24 deletions(-) diff --git a/docker_squash/image.py b/docker_squash/image.py index df5dc27..3e2f4cd 100644 --- a/docker_squash/image.py +++ b/docker_squash/image.py @@ -480,7 +480,7 @@ def _marker_files(self, tar, members): return marker_files - def _add_markers(self, markers, tar, files_in_layers): + def _add_markers(self, markers, tar, files_in_layers, added_symlinks): """ This method is responsible for adding back all markers that were not added to the squashed layer AND files they refer to can be found in layers @@ -503,11 +503,17 @@ def _add_markers(self, markers, tar, files_in_layers): for marker, marker_file in six.iteritems(markers): actual_file = marker.name.replace('.wh.', '') normalized_file = self._normalize_path(actual_file) + should_be_added_back = False + if self._file_should_be_skipped(normalized_file, added_symlinks): + self.log.debug( + "Skipping '%s' marker file, this file is on a symlink path" % normalized_file) + continue + if normalized_file in tar_files: self.log.debug( - "Skipping '%s' marker file, this file was added earlier for some reason..." % marker.name) + "Skipping '%s' marker file, this file was added earlier for some reason..." % normalized_file) continue if files_in_layers: @@ -541,16 +547,18 @@ def _normalize_path(self, path): def _add_hardlinks(self, squashed_tar, squashed_files, to_skip, skipped_hard_links): for layer, hardlinks_in_layer in enumerate(skipped_hard_links): # We need to start from 1, that's why we bump it here - current_layer = layer+1 + current_layer = layer + 1 for member in six.itervalues(hardlinks_in_layer): normalized_name = self._normalize_path(member.name) normalized_linkname = self._normalize_path(member.linkname) # Find out if the name is on the list of files to skip - if it is - get the layer number # where it was found - layer_skip_name = self._file_should_be_skipped(normalized_name, to_skip) + layer_skip_name = self._file_should_be_skipped( + normalized_name, to_skip) # Do the same for linkname - layer_skip_linkname = self._file_should_be_skipped(normalized_linkname, to_skip) + layer_skip_linkname = self._file_should_be_skipped( + normalized_linkname, to_skip) # We need to check if we should skip adding back the hard link # This can happen in the following situations: @@ -573,7 +581,8 @@ def _add_file(self, member, content, squashed_tar, squashed_files, to_skip): normalized_name = self._normalize_path(member.name) if normalized_name in squashed_files: - self.log.debug("Skipping file '%s' because it is already squashed" % normalized_name) + self.log.debug( + "Skipping file '%s' because it is already squashed" % normalized_name) return if self._file_should_be_skipped(normalized_name, to_skip): @@ -592,9 +601,10 @@ def _add_file(self, member, content, squashed_tar, squashed_files, to_skip): squashed_files.append(normalized_name) def _add_symlinks(self, squashed_tar, squashed_files, to_skip, skipped_sym_links): + added_symlinks = [] for layer, symlinks_in_layer in enumerate(skipped_sym_links): # We need to start from 1, that's why we bump it here - current_layer = layer+1 + current_layer = layer + 1 for member in six.itervalues(symlinks_in_layer): # Handling symlinks. This is similar to hard links with one @@ -606,16 +616,24 @@ def _add_symlinks(self, squashed_tar, squashed_files, to_skip, skipped_sym_links # File is already in squashed files, skipping if normalized_name in squashed_files: - self.log.debug("Found a symbolic link '%s' which is already squashed, skipping" % (normalized_name)) + self.log.debug( + "Found a symbolic link '%s' which is already squashed, skipping" % (normalized_name)) continue + if self._file_should_be_skipped(normalized_name, added_symlinks): + self.log.debug( + "Found a symbolic link '%s' which is on a path to previously squashed symlink, skipping" % (normalized_name)) + continue # Find out if the name is on the list of files to skip - if it is - get the layer number # where it was found - layer_skip_name = self._file_should_be_skipped(normalized_name, to_skip) + layer_skip_name = self._file_should_be_skipped( + normalized_name, to_skip) # Do the same for linkname - layer_skip_linkname = self._file_should_be_skipped(normalized_linkname, to_skip) + layer_skip_linkname = self._file_should_be_skipped( + normalized_linkname, to_skip) - # If name or linkname was found in the lists of files to be skipped or it's not found in the squashed files + # If name or linkname was found in the lists of files to be + # skipped or it's not found in the squashed files if layer_skip_name and current_layer > layer_skip_name or layer_skip_linkname and current_layer > layer_skip_linkname: self.log.debug("Found a symbolic link '%s' to a file which is marked to be skipped: '%s', skipping link too" % ( normalized_name, normalized_linkname)) @@ -624,11 +642,13 @@ def _add_symlinks(self, squashed_tar, squashed_files, to_skip, skipped_sym_links self.log.debug("Adding symbolic link '%s' pointing to '%s' back..." % ( normalized_name, normalized_linkname)) - to_skip.append([normalized_name]) + added_symlinks.append([normalized_name]) squashed_files.append(normalized_name) squashed_tar.addfile(member) + return added_symlinks + def _squash_layers(self, layers_to_squash, layers_to_move): self.log.info("Starting squashing...") @@ -744,20 +764,24 @@ def _squash_layers(self, layers_to_squash, layers_to_move): if member.isfile(): content = layer_tar.extractfile(member) - self._add_file(member, content, squashed_tar, squashed_files, to_skip) + self._add_file(member, content, + squashed_tar, squashed_files, to_skip) skipped_hard_links.append(skipped_hard_link_files) skipped_files.append(skipped_files_in_layer) - self._add_hardlinks(squashed_tar, squashed_files, to_skip, skipped_hard_links) - self._add_symlinks(squashed_tar, squashed_files, to_skip, skipped_sym_links) + self._add_hardlinks(squashed_tar, squashed_files, + to_skip, skipped_hard_links) + added_symlinks = self._add_symlinks( + squashed_tar, squashed_files, to_skip, skipped_sym_links) for layer in skipped_files: for member, content in six.itervalues(layer): - self._add_file(member, content, squashed_tar, squashed_files, to_skip) + self._add_file(member, content, squashed_tar, + squashed_files, added_symlinks) if files_in_layers_to_move: self._add_markers(skipped_markers, squashed_tar, - files_in_layers_to_move) + files_in_layers_to_move, added_symlinks) self.log.info("Squashing finished!") diff --git a/tests/test_integ_squash.py b/tests/test_integ_squash.py index 9e65d78..aee5dce 100644 --- a/tests/test_integ_squash.py +++ b/tests/test_integ_squash.py @@ -73,7 +73,7 @@ def __enter__(self): f = BytesIO(self.dockerfile.encode('utf-8')) for line in self.docker.build(fileobj=f, tag=self.tag, rm=True): try: - print(json.loads(line)["stream"].strip()) + print(json.loads(line.decode("utf-8"))["stream"].strip()) except: print(line) @@ -909,7 +909,6 @@ def test_should_handle_symlinks_to_directory(self): with self.Image(dockerfile) as image: with self.SquashedImage(image, 3, numeric=True) as squashed_image: - with self.Container(squashed_image) as container: container.assertFileExists('data-template') container.assertFileExists('data-template/tmp') @@ -918,6 +917,33 @@ def test_should_handle_symlinks_to_directory(self): container.assertFileExists('tmp/dir') container.assertFileDoesNotExist('tmp/dir/file') + # https://github.com/goldmann/docker-squash/issues/122 + def test_should_not_add_duplicate_files(self): + dockerfile = ''' + FROM {} + RUN mkdir -p /etc/systemd/system/multi-user.target.wants + RUN mkdir -p /etc/systemd/system/default.target.wants + RUN touch /etc/systemd/system/multi-user.target.wants/remote-fs.target + RUN touch /etc/systemd/system/default.target.wants/remote-fs.target + # End of preparations, going to squash from here + RUN find /etc/systemd/system/* '!' -name '*.wants' | xargs rm -rvf + RUN rmdir -v /etc/systemd/system/multi-user.target.wants && mkdir /etc/systemd/system/container-ipa.target.wants && ln -s /etc/systemd/system/container-ipa.target.wants /etc/systemd/system/multi-user.target.wants + RUN ln -s /etc/group /etc/systemd/system/default.target + RUN ln -s /etc/group /etc/systemd/system/container-ipa.target.wants/ipa-server-configure-first.service + RUN echo "/etc/systemd/system" > /etc/volume-data-list + RUN set -e ; cd / ; mkdir /data-template ; cat /etc/volume-data-list | while read i ; do echo $i ; if [ -e $i ] ; then tar cf - .$i | ( cd /data-template && tar xf - ) ; fi ; mkdir -p $( dirname $i ) ; if [ "$i" == /var/log/ ] ; then mv /var/log /var/log-removed ; else rm -rf $i ; fi ; ln -sf /data$i $i ; done + '''.format(TestIntegSquash.BUSYBOX_IMAGE) + + with self.Image(dockerfile) as image: + with self.SquashedImage(image, 6, numeric=True, output_path="tox.tar") as squashed_image: + with self.Container(squashed_image) as container: + container.assertFileExists('data-template/etc/systemd/system/container-ipa.target.wants') + container.assertFileExists('data-template/etc/systemd/system/default.target.wants') + container.assertFileExists('data-template/etc/systemd/system/default.target') + container.assertFileExists('data-template/etc/systemd/system/multi-user.target.wants') + container.assertFileExists('data-template/etc/systemd/system/container-ipa.target.wants/ipa-server-configure-first.service') + container.assertFileExists('etc/systemd/system') + class NumericValues(IntegSquash): @classmethod diff --git a/tests/test_unit_v1_image.py b/tests/test_unit_v1_image.py index cc3b8e1..198a795 100644 --- a/tests/test_unit_v1_image.py +++ b/tests/test_unit_v1_image.py @@ -226,7 +226,7 @@ def setUp(self): self.squash = Image(self.log, self.docker_client, self.image, None) def test_should_not_fail_with_empty_list_of_markers_to_add(self): - self.squash._add_markers({}, None, None) + self.squash._add_markers({}, None, None, []) def test_should_add_all_marker_files_to_empty_tar(self): tar = mock.Mock() @@ -236,7 +236,7 @@ def test_should_add_all_marker_files_to_empty_tar(self): type(marker_1).name = mock.PropertyMock(return_value='.wh.marker_1') markers = {marker_1: 'file'} - self.squash._add_markers(markers, tar, {}) + self.squash._add_markers(markers, tar, {}, []) self.assertTrue(len(tar.addfile.mock_calls) == 1) tar_info, marker_file = tar.addfile.call_args[0] @@ -244,6 +244,24 @@ def test_should_add_all_marker_files_to_empty_tar(self): self.assertTrue(marker_file == 'file') self.assertTrue(tar_info.isfile()) + def test_should_add_all_marker_files_to_empty_tar_besides_what_should_be_skipped(self): + tar = mock.Mock() + tar.getnames.return_value = [] + + marker_1 = mock.Mock() + type(marker_1).name = mock.PropertyMock(return_value='.wh.marker_1') + marker_2 = mock.Mock() + type(marker_2).name = mock.PropertyMock(return_value='.wh.marker_2') + + markers = {marker_1: 'file1', marker_2: 'file2'} + self.squash._add_markers(markers, tar, {'1234layerdid': ['/marker_1', '/marker_2']}, [['/marker_1']]) + + self.assertEqual(len(tar.addfile.mock_calls), 1) + tar_info, marker_file = tar.addfile.call_args[0] + self.assertIsInstance(tar_info, tarfile.TarInfo) + self.assertTrue(marker_file == 'file2') + self.assertTrue(tar_info.isfile()) + def test_should_skip_a_marker_file_if_file_is_in_unsquashed_layers(self): tar = mock.Mock() # List of files in the squashed tar @@ -257,7 +275,7 @@ def test_should_skip_a_marker_file_if_file_is_in_unsquashed_layers(self): markers = {marker_1: 'marker_1', marker_2: 'marker_2'} # List of files in all layers to be moved files_in_moved_layers = {'1234layerdid': ['/some/file', '/marker_2']} - self.squash._add_markers(markers, tar, files_in_moved_layers) + self.squash._add_markers(markers, tar, files_in_moved_layers, []) self.assertEqual(len(tar.addfile.mock_calls), 1) tar_info, marker_file = tar.addfile.call_args[0] @@ -275,7 +293,7 @@ def test_should_not_add_any_marker_files(self): type(marker_2).name = mock.PropertyMock(return_value='.wh.marker_2') markers = {marker_1: 'file1', marker_2: 'file2'} - self.squash._add_markers(markers, tar, {'1234layerdid': ['some/file', 'marker_1', 'marker_2']}) + self.squash._add_markers(markers, tar, {'1234layerdid': ['some/file', 'marker_1', 'marker_2']}, []) self.assertTrue(len(tar.addfile.mock_calls) == 0) @@ -293,7 +311,7 @@ def test_should_add_marker_file_when_tar_has_prefixed_entries(self): markers = {marker_1: 'filecontent1', marker_2: 'filecontent2'} # List of layers to move (and files in these layers), already normalized - self.squash._add_markers(markers, tar, {'1234layerdid': ['/some/file', '/other/file', '/stuff']}) + self.squash._add_markers(markers, tar, {'1234layerdid': ['/some/file', '/other/file', '/stuff']}, []) self.assertEqual(len(tar.addfile.mock_calls), 1) tar_info, marker_file = tar.addfile.call_args[0]