From 86ff0288d452f01742010b5e50af62d4469ce092 Mon Sep 17 00:00:00 2001 From: Hidy Han Date: Wed, 15 Nov 2023 00:25:50 -0800 Subject: [PATCH 1/4] Provides an option not to ignore patterns like .git, hidden files and __MACOSX files. Needed for the blob storage migration, because once upon a time these files were uploaded to bundles. Omitting them will cause discrepancies in bundles after the migration. --- codalab/migration.py | 5 +++-- codalab/worker/file_util.py | 11 ++++++----- tests/unit/worker/file_util_test.py | 18 ++++++++++++++++-- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/codalab/migration.py b/codalab/migration.py index a010865c8..cdf50f1ff 100644 --- a/codalab/migration.py +++ b/codalab/migration.py @@ -118,7 +118,8 @@ def upload_to_azure_blob(self, bundle_uuid, bundle_location, is_dir=False): ) if is_dir: - source_fileobj = zip_util.tar_gzip_directory(bundle_location) + source_fileobj = zip_util.tar_gzip_directory( + bundle_location, exclude_patterns=None) source_ext = ".tar.gz" unpack = True else: @@ -164,7 +165,7 @@ def sanity_check(self, bundle_uuid, bundle_location, bundle_info, is_dir): new_location = self.get_bundle_location(bundle_uuid) if is_dir: # For dirs, check the folder contains same files - with OpenFile(new_location, gzipped=True) as f: + with OpenFile(new_location, gzipped=True, exclude_patterns=None) as f: new_file_list = tarfile.open(fileobj=f, mode='r:gz').getnames() new_file_list.sort() diff --git a/codalab/worker/file_util.py b/codalab/worker/file_util.py index aca3a0a45..0b0dad83a 100644 --- a/codalab/worker/file_util.py +++ b/codalab/worker/file_util.py @@ -21,7 +21,7 @@ # from ratarmountcore import SQLiteIndexedTar, FileInfo from ratarmountcore import FileInfo from codalab.lib.beam.SQLiteIndexedTar import SQLiteIndexedTar # type: ignore -from typing import IO, cast +from typing import IO, cast, List NONE_PLACEHOLDER = '' @@ -50,7 +50,7 @@ def get_path_exists(path): def tar_gzip_directory( directory_path, 
follow_symlinks=False, - exclude_patterns=None, + exclude_patterns=ALWAYS_IGNORE_PATTERNS, exclude_names=None, ignore_file=None, ): @@ -79,7 +79,6 @@ def tar_gzip_directory( if not exclude_patterns: exclude_patterns = [] - exclude_patterns.extend(ALWAYS_IGNORE_PATTERNS) for pattern in exclude_patterns: args.append('--exclude=' + pattern) @@ -241,8 +240,9 @@ class OpenFile(object): path: str mode: str gzipped: bool + exclude_patterns: List[str] | None - def __init__(self, path: str, mode='rb', gzipped=False): + def __init__(self, path: str, mode='rb', gzipped=False, exclude_patterns=ALWAYS_IGNORE_PATTERNS): """Initialize OpenFile. Args: @@ -255,6 +255,7 @@ def __init__(self, path: str, mode='rb', gzipped=False): self.path = path self.mode = mode self.gzipped = gzipped + self.exclude_patterns = exclude_patterns def __enter__(self) -> IO[bytes]: linked_bundle_path = parse_linked_bundle_url(self.path) @@ -296,7 +297,7 @@ def __enter__(self) -> IO[bytes]: if os.path.isdir(self.path): if not self.gzipped: raise IOError("Directories must be gzipped.") - return tar_gzip_directory(self.path) + return tar_gzip_directory(self.path, exclude_patterns=self.exclude_patterns) if self.gzipped: raise IOError( "Gzipping local files from disk from OpenFile is not yet supported. Please use file_util.gzip_file instead." 
diff --git a/tests/unit/worker/file_util_test.py b/tests/unit/worker/file_util_test.py index 01719d375..2e416bf9a 100644 --- a/tests/unit/worker/file_util_test.py +++ b/tests/unit/worker/file_util_test.py @@ -407,8 +407,7 @@ def test_always_ignore(self): self.assertNotIn('__MACOSX', output_dir_entries) self.assertFalse(os.path.exists(os.path.join(output_dir, 'dir', '__MACOSX'))) self.assertFalse(os.path.exists(os.path.join(output_dir, 'dir', '._ignored2'))) - - + class TarArchiveTest(ArchiveTestBase, unittest.TestCase): """Archive test for tar/gzip methods.""" @@ -418,6 +417,21 @@ def archive(self, *args, **kwargs): def unarchive(self, *args, **kwargs): return un_tar_directory(*args, **kwargs) + def test_do_not_always_ignore(self): + temp_dir = tempfile.mkdtemp() + self.addCleanup(lambda: remove_path(temp_dir)) + output_dir = os.path.join(temp_dir, 'output') + + self.unarchive(self.archive(IGNORE_TEST_DIR, exclude_patterns=None), output_dir, 'gz') + output_dir_entries = os.listdir(output_dir) + self.assertNotIn('._ignored', output_dir_entries) + self.assertIn('dir', output_dir_entries) + self.assertIn('__MACOSX', output_dir_entries) + self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir', '__MACOSX'))) + self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir', '._ignored2'))) + + + class ZipArchiveTest(ArchiveTestBase, unittest.TestCase): """Archive test for zip methods.""" From 32b2b83dae246f02dc583416f6cd508032aafe69 Mon Sep 17 00:00:00 2001 From: Hidy Han Date: Wed, 15 Nov 2023 00:33:04 -0800 Subject: [PATCH 2/4] formatting --- codalab/migration.py | 3 +-- codalab/worker/file_util.py | 8 +++++--- tests/unit/worker/file_util_test.py | 5 ++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/codalab/migration.py b/codalab/migration.py index cdf50f1ff..87868f41c 100644 --- a/codalab/migration.py +++ b/codalab/migration.py @@ -118,8 +118,7 @@ def upload_to_azure_blob(self, bundle_uuid, bundle_location, is_dir=False): ) if is_dir: - 
source_fileobj = zip_util.tar_gzip_directory( - bundle_location, exclude_patterns=None) + source_fileobj = zip_util.tar_gzip_directory(bundle_location, exclude_patterns=None) source_ext = ".tar.gz" unpack = True else: diff --git a/codalab/worker/file_util.py b/codalab/worker/file_util.py index 0b0dad83a..e94494a50 100644 --- a/codalab/worker/file_util.py +++ b/codalab/worker/file_util.py @@ -21,7 +21,7 @@ # from ratarmountcore import SQLiteIndexedTar, FileInfo from ratarmountcore import FileInfo from codalab.lib.beam.SQLiteIndexedTar import SQLiteIndexedTar # type: ignore -from typing import IO, cast, List +from typing import IO, List, Optional, cast NONE_PLACEHOLDER = '' @@ -240,9 +240,11 @@ class OpenFile(object): path: str mode: str gzipped: bool - exclude_patterns: List[str] | None + exclude_patterns: Optional[List[str]] - def __init__(self, path: str, mode='rb', gzipped=False, exclude_patterns=ALWAYS_IGNORE_PATTERNS): + def __init__( + self, path: str, mode='rb', gzipped=False, exclude_patterns=ALWAYS_IGNORE_PATTERNS + ): """Initialize OpenFile. 
Args: diff --git a/tests/unit/worker/file_util_test.py b/tests/unit/worker/file_util_test.py index 2e416bf9a..3a74dc4b7 100644 --- a/tests/unit/worker/file_util_test.py +++ b/tests/unit/worker/file_util_test.py @@ -407,7 +407,8 @@ def test_always_ignore(self): self.assertNotIn('__MACOSX', output_dir_entries) self.assertFalse(os.path.exists(os.path.join(output_dir, 'dir', '__MACOSX'))) self.assertFalse(os.path.exists(os.path.join(output_dir, 'dir', '._ignored2'))) - + + class TarArchiveTest(ArchiveTestBase, unittest.TestCase): """Archive test for tar/gzip methods.""" @@ -431,8 +432,6 @@ def test_do_not_always_ignore(self): self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir', '._ignored2'))) - - class ZipArchiveTest(ArchiveTestBase, unittest.TestCase): """Archive test for zip methods.""" From 0292de42d360565e969c82c9764d5d48e41ebc1c Mon Sep 17 00:00:00 2001 From: Hidy Han Date: Wed, 15 Nov 2023 00:53:39 -0800 Subject: [PATCH 3/4] test modified --- tests/unit/worker/file_util_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/worker/file_util_test.py b/tests/unit/worker/file_util_test.py index 3a74dc4b7..81fedf4e1 100644 --- a/tests/unit/worker/file_util_test.py +++ b/tests/unit/worker/file_util_test.py @@ -429,7 +429,6 @@ def test_do_not_always_ignore(self): self.assertIn('dir', output_dir_entries) self.assertIn('__MACOSX', output_dir_entries) self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir', '__MACOSX'))) - self.assertTrue(os.path.exists(os.path.join(output_dir, 'dir', '._ignored2'))) class ZipArchiveTest(ArchiveTestBase, unittest.TestCase): From fe0fc044657552a492329bbb6c6471340f672dee Mon Sep 17 00:00:00 2001 From: Hidy Han Date: Sun, 26 Nov 2023 17:51:15 -0800 Subject: [PATCH 4/4] Also updates zip_directory. 
--- codalab/worker/file_util.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/codalab/worker/file_util.py b/codalab/worker/file_util.py index e94494a50..25f0e2959 100644 --- a/codalab/worker/file_util.py +++ b/codalab/worker/file_util.py @@ -98,7 +98,7 @@ def tar_gzip_directory( def zip_directory( directory_path, follow_symlinks=False, - exclude_patterns=None, + exclude_patterns=ALWAYS_IGNORE_PATTERNS, exclude_names=None, ignore_file=None, ): @@ -134,7 +134,6 @@ def zip_directory( if not exclude_patterns: exclude_patterns = [] - exclude_patterns.extend(ALWAYS_IGNORE_PATTERNS) for pattern in exclude_patterns: args.append(f'--exclude=*{pattern}*')