Skip to content

Commit

Permalink
Merge pull request #35 from jaraco/feature/linear-lookup-py2
Browse files Browse the repository at this point in the history
Linear lookup for Python 2
  • Loading branch information
jaraco authored Jan 25, 2020
2 parents 5cbb776 + a122aaa commit 088b0cf
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 46 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ language: python

python:
- 2.7
- 3.5
- &latest_py3 3.7
- 3.6
- &latest_py3 3.8

jobs:
fast_finish: true
Expand Down
9 changes: 9 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
v1.1.0
======

#32: For read-only zip files, complexity of ``.exists`` and
``joinpath`` is now constant time instead of ``O(n)``. The
previous behavior caused quadratic time in common use-cases,
rendering large zip files unusable for Path. Big thanks to
Benjy Weinberger for the bug report and contributed fix (#33).

v1.0.0
======

Expand Down
4 changes: 2 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ packages = find:
include_package_data = true
python_requires = >=2.7
install_requires =
more_itertools
contextlib2; python_version < "3.4"
setup_requires = setuptools_scm >= 1.15.0

[options.extras_require]
Expand All @@ -31,8 +31,8 @@ testing =

# local
pathlib2
contextlib2
unittest2
jaraco.itertools

docs =
# upstream
Expand Down
44 changes: 42 additions & 2 deletions test_zipp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

import io
import zipfile
import posixpath
import contextlib
import tempfile
import shutil
Expand All @@ -25,6 +24,8 @@
except AttributeError:
import unittest2 as unittest

import jaraco.itertools

import zipp

__metaclass__ = type
Expand All @@ -36,7 +37,7 @@ def add_dirs(zf):
Given a writable zip file zf, inject directory entries for
any directories implied by the presence of children.
"""
for name in zipp.Path._implied_dirs(zf.namelist()):
for name in zipp.CompleteDirs._implied_dirs(zf.namelist()):
zf.writestr(name, b"")
return zf

Expand Down Expand Up @@ -196,3 +197,42 @@ def test_missing_dir_parent(self):
for alpharep in self.zipfile_alpharep():
root = zipp.Path(alpharep)
assert (root / 'missing dir/').parent.at == ''

def test_mutability(self):
    """
    Entries written to the underlying zipfile after a Path has
    been created for it should be visible through that Path.
    """
    for archive in self.zipfile_alpharep():
        top = zipp.Path(archive)
        # Unpacking into three names also requires that the fixture
        # starts out with exactly three top-level entries.
        a, b, g = top.iterdir()
        archive.writestr('foo.txt', b'foo')
        archive.writestr('bar/baz.txt', b'baz')
        child_names = (child.name for child in top.iterdir())
        assert 'foo.txt' in child_names
        assert (top / 'foo.txt').read_text() == 'foo'
        [baz] = (top / 'bar').iterdir()
        assert baz.read_text() == 'baz'

HUGE_ZIPFILE_NUM_ENTRIES = 2 ** 13

def huge_zipfile(self):
    """Create a read-only zipfile with a huge number of entries."""
    buffer = io.BytesIO()
    archive = zipfile.ZipFile(buffer, "w")
    for index in range(self.HUGE_ZIPFILE_NUM_ENTRIES):
        name = str(index)
        # Each entry's content is simply its own name.
        archive.writestr(name, name)
    # Flip the mode once writing is done so the archive reports
    # itself as read-only to its consumers.
    archive.mode = 'r'
    return archive

def test_joinpath_constant_time(self):
    """
    Call joinpath on every entry of the huge zipfile and check
    that every entry was visited.

    No timing is measured here; the test is only practical when
    each joinpath call is cheap.
    """
    huge_root = zipp.Path(self.huge_zipfile())
    counted = jaraco.itertools.Counter(huge_root.iterdir())
    for child in counted:
        child.joinpath('suffix')
    # The counter must have seen every entry written to the archive.
    assert counted.count == self.HUGE_ZIPFILE_NUM_ENTRIES
134 changes: 94 additions & 40 deletions zipp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@
import zipfile
import functools
import itertools
from collections import OrderedDict

import more_itertools
try:
from contextlib import suppress
except ImportError:
from contextlib2 import suppress

__metaclass__ = type

Expand Down Expand Up @@ -55,6 +59,90 @@ def _ancestry(path):
path, tail = posixpath.split(path)


class CompleteDirs(zipfile.ZipFile):
    """
    A ZipFile subclass that ensures that implied directories
    are always included in the namelist.
    """

    @staticmethod
    def _implied_dirs(names):
        """
        Return the directories implied by ``names`` but missing from it.

        The result is an ``OrderedDict`` whose keys are the implied
        directory names, each ending in ``posixpath.sep``, with
        duplicates removed and first-seen order preserved.
        """
        # Build the lookup set once. Placing ``set(names)`` in the
        # generator's condition would rebuild it for every candidate,
        # making the whole computation quadratic.
        known = set(names)
        parents = itertools.chain.from_iterable(map(_parents, names))
        as_dirs = (p + posixpath.sep for p in parents)
        # OrderedDict.fromkeys deduplicates entries in original order.
        return OrderedDict.fromkeys(d for d in as_dirs if d not in known)

    def namelist(self):
        """
        Return the archive's names followed by any implied directories.
        """
        names = super(CompleteDirs, self).namelist()
        return names + list(self._implied_dirs(names))

    def _name_set(self):
        """Return the complete name list as a set for membership tests."""
        return set(self.namelist())

    def resolve_dir(self, name):
        """
        If the name represents a directory, return that name
        as a directory (with the trailing slash).
        """
        names = self._name_set()
        dirname = name + '/'
        # Only rewrite when the bare name is absent and the
        # slash-terminated form is present.
        dir_match = name not in names and dirname in names
        return dirname if dir_match else name

    @classmethod
    def make(cls, source):
        """
        Given a source (filename or zipfile), return an
        appropriate CompleteDirs subclass.
        """
        if isinstance(source, CompleteDirs):
            return source

        if not isinstance(source, zipfile.ZipFile):
            return cls(_pathlib_compat(source))

        # Only allow the requested (possibly caching) class when the
        # supplied zipfile is read-only; otherwise fall back to
        # CompleteDirs.
        if 'r' not in source.mode:
            cls = CompleteDirs

        # Adopt the state of the existing ZipFile without reopening it.
        res = cls.__new__(cls)
        vars(res).update(vars(source))
        return res


class FastLookup(CompleteDirs):
    """
    ZipFile subclass to ensure implicit
    dirs exist and are resolved rapidly.

    The name list and the name set are each computed on first use
    and stored on the instance; later calls return the stored value.
    """

    def namelist(self):
        try:
            return self.__names
        except AttributeError:
            # First call: compute once and remember the result.
            self.__names = super(FastLookup, self).namelist()
            return self.__names

    def _name_set(self):
        try:
            return self.__lookup
        except AttributeError:
            # First call: compute once and remember the result.
            self.__lookup = super(FastLookup, self)._name_set()
            return self.__lookup


def _pathlib_compat(path):
"""
For path-like objects, convert to a filename for compatibility
on Python 3.6.1 and earlier.
"""
try:
return path.__fspath__()
except AttributeError:
return str(path)


class Path:
"""
A pathlib-compatible interface for zip files.
Expand Down Expand Up @@ -123,24 +211,9 @@ class Path:
__repr = "{self.__class__.__name__}({self.root.filename!r}, {self.at!r})"

def __init__(self, root, at=""):
self.root = (
root
if isinstance(root, zipfile.ZipFile)
else zipfile.ZipFile(self._pathlib_compat(root))
)
self.root = FastLookup.make(root)
self.at = at

@staticmethod
def _pathlib_compat(path):
"""
For path-like objects, convert to a filename for compatibility
on Python 3.6.1 and earlier.
"""
try:
return path.__fspath__()
except AttributeError:
return str(path)

@property
def open(self):
    """
    A callable that forwards to ``self.root.open`` with this
    path's ``at`` location supplied as the first argument.
    """
    opener = self.root.open
    return functools.partial(opener, self.at)
Expand Down Expand Up @@ -170,12 +243,12 @@ def is_file(self):
return not self.is_dir()

def exists(self):
return self.at in self._names()
return self.at in self.root._name_set()

def iterdir(self):
if not self.is_dir():
raise ValueError("Can't listdir a file")
subs = map(self._next, self._names())
subs = map(self._next, self.root.namelist())
return filter(self._is_child, subs)

def __str__(self):
Expand All @@ -185,36 +258,17 @@ def __repr__(self):
return self.__repr.format(self=self)

def joinpath(self, add):
add = self._pathlib_compat(add)
next = posixpath.join(self.at, add)
next_dir = posixpath.join(self.at, add, "")
names = self._names()
return self._next(next_dir if next not in names and next_dir in names else next)
next = posixpath.join(self.at, _pathlib_compat(add))
return self._next(self.root.resolve_dir(next))

__truediv__ = joinpath

@staticmethod
def _implied_dirs(names):
return more_itertools.unique_everseen(
parent + "/"
for name in names
for parent in _parents(name)
if parent + "/" not in names
)

@classmethod
def _add_implied_dirs(cls, names):
return names + list(cls._implied_dirs(names))

@property
def parent(self):
    """
    The result of ``self._next`` for the location that contains
    this one.

    The containing location is the dirname of ``at`` (ignoring any
    trailing slashes) with a trailing slash appended, or the empty
    string when there is no containing directory.
    """
    stem = posixpath.dirname(self.at.rstrip('/'))
    return self._next(stem + '/' if stem else stem)

def _names(self):
return self._add_implied_dirs(self.root.namelist())

if sys.version_info < (3,):
__div__ = __truediv__

0 comments on commit 088b0cf

Please sign in to comment.