Skip to content

Commit

Permalink
Provides an option not to ignore patterns like .git, hidden files and __MACOSX files (#4578)
Browse files: browse the repository at this point in the history

* Provides an option not to ignore patterns like .git, hidden files, and __MACOSX files. This is needed for the blob storage migration because such files were uploaded to bundles in the past; omitting them would cause discrepancies in bundles after the migration.

* formatting

* test modified

* Also updates zip_directory.
  • Loading branch information
HidyHan authored Nov 29, 2023
1 parent 984573d commit 128bf4d
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 9 deletions.
4 changes: 2 additions & 2 deletions codalab/migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def upload_to_azure_blob(self, bundle_uuid, bundle_location, is_dir=False):
)

if is_dir:
source_fileobj = zip_util.tar_gzip_directory(bundle_location)
source_fileobj = zip_util.tar_gzip_directory(bundle_location, exclude_patterns=None)
source_ext = ".tar.gz"
unpack = True
else:
Expand Down Expand Up @@ -164,7 +164,7 @@ def sanity_check(self, bundle_uuid, bundle_location, bundle_info, is_dir):
new_location = self.get_bundle_location(bundle_uuid)
if is_dir:
# For dirs, check the folder contains same files
with OpenFile(new_location, gzipped=True) as f:
with OpenFile(new_location, gzipped=True, exclude_patterns=None) as f:
new_file_list = tarfile.open(fileobj=f, mode='r:gz').getnames()
new_file_list.sort()

Expand Down
16 changes: 9 additions & 7 deletions codalab/worker/file_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
# from ratarmountcore import SQLiteIndexedTar, FileInfo
from ratarmountcore import FileInfo
from codalab.lib.beam.SQLiteIndexedTar import SQLiteIndexedTar # type: ignore
from typing import IO, cast
from typing import IO, List, Optional, cast

NONE_PLACEHOLDER = '<none>'

Expand Down Expand Up @@ -50,7 +50,7 @@ def get_path_exists(path):
def tar_gzip_directory(
directory_path,
follow_symlinks=False,
exclude_patterns=None,
exclude_patterns=ALWAYS_IGNORE_PATTERNS,
exclude_names=None,
ignore_file=None,
):
Expand Down Expand Up @@ -79,7 +79,6 @@ def tar_gzip_directory(
if not exclude_patterns:
exclude_patterns = []

exclude_patterns.extend(ALWAYS_IGNORE_PATTERNS)
for pattern in exclude_patterns:
args.append('--exclude=' + pattern)

Expand All @@ -99,7 +98,7 @@ def tar_gzip_directory(
def zip_directory(
directory_path,
follow_symlinks=False,
exclude_patterns=None,
exclude_patterns=ALWAYS_IGNORE_PATTERNS,
exclude_names=None,
ignore_file=None,
):
Expand Down Expand Up @@ -135,7 +134,6 @@ def zip_directory(
if not exclude_patterns:
exclude_patterns = []

exclude_patterns.extend(ALWAYS_IGNORE_PATTERNS)
for pattern in exclude_patterns:
args.append(f'--exclude=*{pattern}*')

Expand Down Expand Up @@ -241,8 +239,11 @@ class OpenFile(object):
path: str
mode: str
gzipped: bool
exclude_patterns: Optional[List[str]]

def __init__(self, path: str, mode='rb', gzipped=False):
def __init__(
self, path: str, mode='rb', gzipped=False, exclude_patterns=ALWAYS_IGNORE_PATTERNS
):
"""Initialize OpenFile.
Args:
Expand All @@ -255,6 +256,7 @@ def __init__(self, path: str, mode='rb', gzipped=False):
self.path = path
self.mode = mode
self.gzipped = gzipped
self.exclude_patterns = exclude_patterns

def __enter__(self) -> IO[bytes]:
linked_bundle_path = parse_linked_bundle_url(self.path)
Expand Down Expand Up @@ -296,7 +298,7 @@ def __enter__(self) -> IO[bytes]:
if os.path.isdir(self.path):
if not self.gzipped:
raise IOError("Directories must be gzipped.")
return tar_gzip_directory(self.path)
return tar_gzip_directory(self.path, exclude_patterns=self.exclude_patterns)
if self.gzipped:
raise IOError(
"Gzipping local files from disk from OpenFile is not yet supported. Please use file_util.gzip_file instead."
Expand Down
12 changes: 12 additions & 0 deletions tests/unit/worker/file_util_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,6 +418,18 @@ def archive(self, *args, **kwargs):
def unarchive(self, *args, **kwargs):
return un_tar_directory(*args, **kwargs)

def test_do_not_always_ignore(self):
    """Round-trip IGNORE_TEST_DIR with ``exclude_patterns=None`` and inspect the result.

    The directory is archived via ``self.archive`` with ``exclude_patterns=None``,
    extracted via ``self.unarchive`` into a fresh temporary location, and the
    extracted entries are checked: ``__MACOSX`` must be present both at the top
    level and under ``dir``.
    """
    scratch_root = tempfile.mkdtemp()
    # Remove the scratch directory (and everything extracted into it) after the test.
    self.addCleanup(remove_path, scratch_root)
    extracted_root = os.path.join(scratch_root, 'output')

    archive_stream = self.archive(IGNORE_TEST_DIR, exclude_patterns=None)
    self.unarchive(archive_stream, extracted_root, 'gz')

    top_level_entries = os.listdir(extracted_root)
    # NOTE(review): '._ignored' is still expected to be absent even though no
    # exclude patterns are passed; presumably it is filtered elsewhere. Confirm.
    self.assertNotIn('._ignored', top_level_entries)
    for expected_entry in ('dir', '__MACOSX'):
        self.assertIn(expected_entry, top_level_entries)

    nested_macosx = os.path.join(extracted_root, 'dir', '__MACOSX')
    self.assertTrue(os.path.exists(nested_macosx))


class ZipArchiveTest(ArchiveTestBase, unittest.TestCase):
"""Archive test for zip methods."""
Expand Down

0 comments on commit 128bf4d

Please sign in to comment.