Fix bug in archive subfolders and allow custom unpack locations (#224)
Archive members in subfolders were leading to a `FileNotFoundError` because the
processor didn't create the required destination folders. This adds a check and
creates the folders if they are missing. Also adds an `extract_dir` argument to
`Unzip` and `Untar` to set a custom unpack folder (defaults to `None`).
drammock authored May 5, 2021
1 parent c782f18 commit 858d8f1
Showing 4 changed files with 110 additions and 74 deletions.
3 changes: 2 additions & 1 deletion AUTHORS.md
@@ -8,8 +8,9 @@ order by last name) and are considered "The Pooch Developers":
* [Mathias Hauser](https://github.com/mathause) - Institute for Atmospheric and Climate Science, ETH Zurich, Zurich, Switzerland (ORCID: [0000-0002-0057-4878](https://orcid.org/0000-0002-0057-4878))
* [Danilo Horta](https://github.com/horta) - EMBL-EBI, UK
* [Hugo van Kemenade](https://github.com/hugovk) - Independent (Non-affiliated) (ORCID: [0000-0001-5715-8632](https://www.orcid.org/0000-0001-5715-8632))
* [Kacper Kowalik](https://github.com/Xarthisius) - National Center for Supercomputing Applications, Univeristy of Illinois at Urbana-Champaign, USA (ORCID: [0000-0003-1709-3744](https://www.orcid.org/0000-0003-1709-3744))
* [Kacper Kowalik](https://github.com/Xarthisius) - National Center for Supercomputing Applications, University of Illinois at Urbana-Champaign, USA (ORCID: [0000-0003-1709-3744](https://www.orcid.org/0000-0003-1709-3744))
* [John Leeman](https://github.com/jrleeman)
* [Daniel McCloy](https://github.com/drammock) - University of Washington, USA (ORCID: [0000-0002-7572-3241](https://orcid.org/0000-0002-7572-3241))
* [Rémi Rampin](https://github.com/remram44) - New York University, USA (ORCID: [0000-0002-0524-2282](https://www.orcid.org/0000-0002-0524-2282))
* [Daniel Shapero](https://github.com/danshapero) - Polar Science Center, University of Washington Applied Physics Lab, USA (ORCID: [0000-0002-3651-0649](https://www.orcid.org/0000-0002-3651-0649))
* [Santiago Soler](https://github.com/santisoler) - CONICET, Argentina; Instituto Geofísico Sismológico Volponi, Universidad Nacional de San Juan, Argentina (ORCID: [0000-0001-9202-5317](https://www.orcid.org/0000-0001-9202-5317))
35 changes: 30 additions & 5 deletions doc/processors.rst
@@ -67,7 +67,7 @@ For example, to extract a single file from a zip archive:
Load a large zipped sample data as a pandas.DataFrame.
"""
# Extract the file "actual-data-file.txt" from the archive
unpack = Unzip(members=["actual-data-file.txt"])
unpack = Unzip(members=["actual-data-file.txt"])
# Pass in the processor to unzip the data file
fnames = GOODBOY.fetch("zipped-data-file.zip", processor=unpack)
# Returns the paths of all extracted members (in our case, only one)
@@ -77,18 +77,43 @@ For example, to extract a single file from a zip archive:
data = pandas.read_csv(fname)
return data
Or to extract all files into a folder and return the path to each file:
By default, the :class:`~pooch.Unzip` processor (and similarly the
:class:`~pooch.Untar` processor) will create a new folder in the same location
as the downloaded archive file, and give it the same name as the archive file
with the suffix ``.unzip`` (or ``.untar``) appended. If you want to change the
location of the unpacked files, you can provide a parameter ``extract_dir`` to
the processor to tell it where you want to unpack the files:

.. code:: python
from pooch import Untar
def fetch_and_unpack_tar_file():
"""
Unpack a file from a tar archive to a custom subdirectory in the cache.
"""
# Extract a single file from the archive, to a specific location
unpack_to_custom_dir = Untar(members=["actual-data-file.txt"],
extract_dir="custom_folder")
# Pass in the processor to untar the data file
fnames = GOODBOY.fetch("tarred-data-file.tar.gz", processor=unpack_to_custom_dir)
# Returns the paths of all extracted members (in our case, only one)
fname = fnames[0]
return fname
To extract all files into a folder and return the path to each file, simply
omit the ``members`` parameter:

.. code:: python
def fetch_zipped_archive():
"""
Load all files from a zipped archive.
"""
# Pass in the processor to unzip the data file
fnames = GOODBOY.fetch("zipped-archive.zip", processor=Unzip())
data = [pandas.read_csv(fname) for fname in fnames]
return data
return fnames
Use :class:`pooch.Untar` to do the exact same for tar archives (with optional
compression).
48 changes: 39 additions & 9 deletions pooch/processors.py
@@ -13,6 +13,7 @@
import gzip
import lzma
import shutil
from pathlib import Path
from zipfile import ZipFile
from tarfile import TarFile

@@ -40,8 +41,9 @@ class ExtractorProcessor: # pylint: disable=too-few-public-methods
# String appended to unpacked archive. To be implemented in subclass
suffix = None

def __init__(self, members=None):
def __init__(self, members=None, extract_dir=None):
self.members = members
self.extract_dir = extract_dir

def __call__(self, fname, action, pooch):
"""
@@ -68,21 +70,25 @@ def __call__(self, fname, action, pooch):
A list of the full path to all files in the extracted archive.
"""
if self.suffix is None:
if self.suffix is None and self.extract_dir is None:
raise NotImplementedError(
"Derived classes must define the 'suffix' attribute."
"Derived classes must define either a 'suffix' attribute or "
"an 'extract_dir' attribute."
)
extract_dir = fname + self.suffix
if action in ("update", "download") or not os.path.exists(extract_dir):
if self.extract_dir is None:
self.extract_dir = fname + self.suffix
else:
archive_dir = fname.rsplit(os.path.sep, maxsplit=1)[0]
self.extract_dir = os.path.join(archive_dir, self.extract_dir)
if action in ("update", "download") or not os.path.exists(self.extract_dir):
# Make sure that the folder with the extracted files exists
if not os.path.exists(extract_dir):
os.makedirs(extract_dir)
self._extract_file(fname, extract_dir)
os.makedirs(self.extract_dir, exist_ok=True)
self._extract_file(fname, self.extract_dir)
# Get a list of all file names (including subdirectories) in our folder
# of unzipped files.
fnames = [
os.path.join(path, fname)
for path, _, files in os.walk(extract_dir)
for path, _, files in os.walk(self.extract_dir)
for fname in files
]
return fnames
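The destination-folder resolution added to `__call__` above can be restated as a small standalone helper. This is a simplified sketch, not pooch's actual code; `resolve_extract_dir` is a hypothetical name.

```python
import os


def resolve_extract_dir(fname, suffix, extract_dir=None):
    """Pick the unpack destination for a downloaded archive.

    Mirrors the logic above: by default, unpack next to the archive
    into a folder named after it plus a suffix (".unzip"/".untar");
    a custom extract_dir is interpreted relative to the folder that
    contains the archive (i.e. the cache folder).
    """
    if extract_dir is None:
        return fname + suffix
    # Equivalent to fname.rsplit(os.path.sep, maxsplit=1)[0] above
    archive_dir = os.path.dirname(fname)
    return os.path.join(archive_dir, extract_dir)
```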
@@ -112,6 +118,13 @@ class Unzip(ExtractorProcessor): # pylint: disable=too-few-public-methods
If None, will unpack all files in the zip archive. Otherwise, *members*
must be a list of file names to unpack from the archive. Only these
files will be unpacked.
extract_dir : str or None
If None, files will be unpacked to the default location (a folder in
the same location as the downloaded zip file, with the suffix
``.unzip`` added). Otherwise, files will be unpacked to
``extract_dir``, which is interpreted as a *relative path* (relative to
the cache location provided by :func:`pooch.retrieve` or
:meth:`pooch.Pooch.fetch`).
"""

@@ -134,6 +147,11 @@ def _extract_file(self, fname, extract_dir):
get_logger().info(
"Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
)
# make sure the target folder exists for nested members
parts = Path(member).parts
if len(parts) > 1:
full_dir_path = os.path.join(extract_dir, *parts[:-1])
os.makedirs(full_dir_path, exist_ok=True)
# Extract the data file from within the archive
with zip_file.open(member) as data_file:
# Save it to our desired file name
@@ -159,6 +177,13 @@ class Untar(ExtractorProcessor): # pylint: disable=too-few-public-methods
If None, will unpack all files in the archive. Otherwise, *members*
must be a list of file names to unpack from the archive. Only these
files will be unpacked.
extract_dir : str or None
If None, files will be unpacked to the default location (a folder in
the same location as the downloaded tar file, with the suffix
``.untar`` added). Otherwise, files will be unpacked to
``extract_dir``, which is interpreted as a *relative path* (relative to
the cache location provided by :func:`pooch.retrieve` or
:meth:`pooch.Pooch.fetch`).
"""

suffix = ".untar"
@@ -180,6 +205,11 @@ def _extract_file(self, fname, extract_dir):
get_logger().info(
"Extracting '%s' from '%s' to '%s'", member, fname, extract_dir
)
# make sure the target folder exists for nested members
parts = Path(member).parts
if len(parts) > 1:
full_dir_path = os.path.join(extract_dir, *parts[:-1])
os.makedirs(full_dir_path, exist_ok=True)
# Extract the data file from within the archive
# Python 2.7: extractfile doesn't return a context manager
data_file = tar_file.extractfile(member)
98 changes: 39 additions & 59 deletions pooch/tests/test_processors.py
@@ -110,77 +110,57 @@ def test_extractprocessor_fails():
assert not exception.value.args


@pytest.mark.parametrize("_dir", [None, "foo"], ids=["default_dir", "custom_dir"])
@pytest.mark.parametrize(
"proc_cls,ext", [(Unzip, ".zip"), (Untar, ".tar.gz")], ids=["Unzip", "Untar"]
)
def test_processors(proc_cls, ext):
"Setup a hook and make sure it's only executed when downloading"
processor = proc_cls(members=["tiny-data.txt"])
suffix = proc_cls.suffix
extract_dir = "tiny-data" + ext + suffix
with TemporaryDirectory() as local_store:
path = Path(local_store)
true_path = str(path / extract_dir / "tiny-data.txt")
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Check the logs when downloading and from the processor
with capture_log() as log_file:
fnames = pup.fetch("tiny-data" + ext, processor=processor)
fname = fnames[0]
assert len(fnames) == 1
logs = log_file.getvalue()
lines = logs.splitlines()
assert len(lines) == 2
assert lines[0].split()[0] == "Downloading"
assert lines[-1].startswith("Extracting 'tiny-data.txt'")

assert fname == true_path
check_tiny_data(fname)
# Check that processor doesn't execute when not downloading
with capture_log() as log_file:
fnames = pup.fetch("tiny-data" + ext, processor=processor)
fname = fnames[0]
assert len(fnames) == 1
assert log_file.getvalue() == ""
assert fname == true_path
check_tiny_data(fname)


@pytest.mark.parametrize(
"proc_cls,ext,msg",
"proc_cls,file_ext,msg",
[(Unzip, ".zip", "Unzipping"), (Untar, ".tar.gz", "Untarring")],
ids=["Unzip", "Untar"],
)
def test_processor_multiplefiles(proc_cls, ext, msg):
"Setup a processor to unzip/untar a file and return multiple fnames"
processor = proc_cls()
suffix = proc_cls.suffix
extract_dir = "store" + ext + suffix
@pytest.mark.parametrize(
"archive_basename,members",
[
("tiny-data", ["tiny-data.txt"]), # 1 compressed file
("store", None), # all files in an archive
("store", ["store/subdir/tiny-data.txt"]), # 1 file nested in archive
],
ids=["onefile", "all_files", "nested"],
)
def test_processors(_dir, proc_cls, file_ext, msg, archive_basename, members):
"Setup a hook and make sure it's only executed when downloading"
processor = proc_cls(members=members, extract_dir=_dir)
_dir = archive_basename + file_ext + proc_cls.suffix if _dir is None else _dir
with TemporaryDirectory() as local_store:
path = Path(local_store)
true_paths = {
str(path / extract_dir / "store" / "tiny-data.txt"),
str(path / extract_dir / "store" / "subdir" / "tiny-data.txt"),
}
true_paths = [
str(path / _dir / "tiny-data.txt"),
str(path / _dir / "store" / "tiny-data.txt"),
str(path / _dir / "store" / "subdir" / "tiny-data.txt"),
]
if archive_basename == "tiny-data":
true_paths = set(true_paths[:1])
log_line = "Extracting 'tiny-data.txt'"
elif members is None:
true_paths = set(true_paths[1:])
log_line = f"{msg} contents"
else:
true_paths = set(true_paths[-1:])
log_line = "Extracting 'store/subdir/tiny-data.txt'"
# Setup a pooch in a temp dir
pup = Pooch(path=path, base_url=BASEURL, registry=REGISTRY)
# Check the logs when downloading and from the processor
with capture_log() as log_file:
fnames = pup.fetch("store" + ext, processor=processor)
logs = log_file.getvalue()
lines = logs.splitlines()
fnames = pup.fetch(archive_basename + file_ext, processor=processor)
assert set(fnames) == true_paths
lines = log_file.getvalue().splitlines()
assert len(lines) == 2
assert lines[0].split()[0] == "Downloading"
assert lines[-1].startswith(f"{msg} contents")
assert len(fnames) == 2
assert true_paths == set(fnames)
for fname in fnames:
check_tiny_data(fname)
assert lines[-1].startswith(log_line)
for fname in fnames:
check_tiny_data(fname)
# Check that processor doesn't execute when not downloading
with capture_log() as log_file:
fnames = pup.fetch("store" + ext, processor=processor)
fnames = pup.fetch(archive_basename + file_ext, processor=processor)
assert set(fnames) == true_paths
assert log_file.getvalue() == ""
assert len(fnames) == 2
assert true_paths == set(fnames)
for fname in fnames:
check_tiny_data(fname)
for fname in fnames:
check_tiny_data(fname)
