Normalize path (#86)

* Add a test for nonascii query part
* Fix #84: normalize path by handling . and ..
* Fix with_path
* Add missing test file
* Improve coverage
aio-libs · Jun 24, 2017 · 3d0e035 · 3d0e035
1 parent 14d78e3
commit 3d0e035
Show file tree

Hide file tree

Showing 4 changed files with 128 additions and 11 deletions.
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -1,6 +1,11 @@
 CHANGES
 =======
 
+0.10.3 (2017-06-13)
+-------------------
+
+* Prevent double URL args unquoting #83
+
 0.10.2 (2017-05-05)
 -------------------
 

diff --git a/tests/test_normalize_path.py b/tests/test_normalize_path.py
@@ -0,0 +1,21 @@
+from yarl import _normalize_path as np
+
+
+def test_no_dots():
+    """A path without dot segments comes back unchanged."""
+    path = 'path/to'
+    assert np(path) == path
+
+
+def test_skip_dots():
+    """A '.' segment in the middle of the path is dropped."""
+    result = np('path/./to')
+    assert result == 'path/to'
+
+
+def test_dot_at_end():
+    """A trailing '.' segment is replaced by a trailing slash."""
+    result = np('path/to/.')
+    assert result == 'path/to/'
+
+
+def test_double_dots():
+    """A '..' segment removes the segment that precedes it."""
+    result = np('path/../to')
+    assert result == 'to'
+
+
+def test_extra_double_dots():
+    """A '..' with no preceding segment left to remove is ignored."""
+    result = np('path/../../to')
+    assert result == 'to'
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -44,6 +44,11 @@ def test_origin_no_scheme():
         url.origin()
 
 
+def test_drop_dots():
+    """A '..' segment in the path is resolved when the URL is parsed."""
+    expected = 'http://example.com/to'
+    assert str(URL('http://example.com/path/../to')) == expected
+
+
 def test_abs_cmp():
     assert URL('http://example.com:8888') == URL('http://example.com:8888')
     assert URL('http://example.com:8888/') == URL('http://example.com:8888/')
@@ -246,6 +251,11 @@ def test_query_dont_unqoute_twice():
     assert url.query['url'] == sample_url
 
 
+def test_query_nonascii():
+    """Non-ASCII query keys and values show up in ``url.query``."""
+    expected = MultiDict({'ключ': 'знач'})
+    assert URL('http://example.com?ключ=знач').query == expected
+
+
 def test_raw_fragment_empty():
     url = URL('http://example.com')
     assert '' == url.raw_fragment
@@ -490,6 +500,11 @@ def test_div_with_colon_and_at():
     assert url.raw_path == '/base/path:abc@123'
 
 
+def test_div_with_dots():
+    """Joining with '/' resolves '.' and '..' in the appended name."""
+    base = URL('http://example.com/base')
+    joined = base / '../path/./to'
+    assert joined.raw_path == '/path/to'
+
+
 # comparison and hashing
 
 def test_ne_str():
@@ -728,6 +743,8 @@ def test_with_port_invalid_type():
     with pytest.raises(TypeError):
         URL('http://example.com').with_port('123')
 
+# with_path
+
 
 def test_with_path():
     url = URL('http://example.com')
@@ -741,6 +758,18 @@ def test_with_path_encoded():
                ) == 'http://example.com/test'
 
 
+def test_with_path_dots():
+    """with_path() turns a trailing '.' segment into a trailing slash."""
+    updated = URL('http://example.com').with_path('/test/.')
+    assert str(updated) == 'http://example.com/test/'
+
+
+def test_with_path_relative():
+    """with_path() replaces the path of a URL that has no host."""
+    updated = URL('/path').with_path('/new')
+    assert str(updated) == '/new'
+
+
+# with_query
+
 def test_with_query():
     url = URL('http://example.com')
     assert str(url.with_query({'a': '1'})) == 'http://example.com/?a=1'
@@ -910,6 +939,8 @@ def test_with_fragment_bad_type():
     with pytest.raises(TypeError):
         url.with_fragment(123)
 
+# with_name
+
 
 def test_with_name():
     url = URL('http://example.com/a/b')
@@ -975,6 +1006,16 @@ def test_with_name_within_colon_and_at():
     url = URL('http://example.com/oldpath').with_name('path:abc@123')
     assert url.raw_path == '/path:abc@123'
 
+
+def test_with_name_dot():
+    """with_name('.') is rejected with ValueError."""
+    url = URL('http://example.com')
+    with pytest.raises(ValueError):
+        url.with_name('.')
+
+
+def test_with_name_double_dot():
+    """with_name('..') is rejected with ValueError."""
+    url = URL('http://example.com')
+    with pytest.raises(ValueError):
+        url.with_name('..')
+
 # is_absolute
 
 
@@ -1544,3 +1585,12 @@ def test_build_query_quoting():
     assert u == URL('http://127.0.0.1/файл.jpg?arg=Привет')
     assert str(u) == ('http://127.0.0.1/%D1%84%D0%B0%D0%B9%D0%BB.jpg?'
                       'arg=%D0%9F%D1%80%D0%B8%D0%B2%D0%B5%D1%82')
+
+
+def test_build_drop_dots():
+    """URL.build() resolves a '..' segment in the path it is given."""
+    parts = dict(scheme='http', host='example.com', path='/path/../to')
+    u = URL.build(**parts)
+    assert str(u) == 'http://example.com/to'
diff --git a/yarl/__init__.py b/yarl/__init__.py
@@ -10,7 +10,7 @@
 
 from .quoting import quote, unquote
 
-__version__ = '0.10.2'
+__version__ = '0.10.3'
 
 __all__ = ['URL']
 
@@ -181,13 +181,14 @@ def __init__(self, val='', *, encoded=False, strict=False):
                         user += ':' + _quote(val.password)
                     netloc = user + '@' + netloc
 
-            val = SplitResult(
-                val[0],  # scheme
-                netloc,
-                _quote(val[2], safe='+@:', protected='/+', strict=strict),
-                query=_quote(val[3], safe='=+&?/:@',
-                             protected=PROTECT_CHARS, qs=True, strict=strict),
-                fragment=_quote(val[4], safe='?/:@', strict=strict))
+            path = _quote(val[2], safe='+@:', protected='/+', strict=strict)
+            if netloc:
+                path = _normalize_path(path)
+
+            query = _quote(val[3], safe='=+&?/:@',
+                           protected=PROTECT_CHARS, qs=True, strict=strict)
+            fragment = _quote(val[4], safe='?/:@', strict=strict)
+            val = SplitResult(val[0], netloc, path, query, fragment)
 
         self._val = val
         self._cache = {}
@@ -207,11 +208,16 @@ def build(cls, *, scheme='', user='', password='', host='', port=None,
             raise ValueError(
                 "Only one of \"query\" or \"query_string\" should be passed")
 
+        netloc = cls._make_netloc(user, password, host, port)
+        path = _quote(path, safe='@:', protected='/')
+        if netloc:
+            path = _normalize_path(path)
+
         url = cls(
             SplitResult(
                 scheme,
-                cls._make_netloc(user, password, host, port),
-                _quote(path, safe='@:', protected='/'),
+                netloc,
+                path,
                 _quote(query_string),
                 fragment
             ),
@@ -290,6 +296,8 @@ def __truediv__(self, name):
             parts = path.rstrip('/').split('/')
             parts.append(name)
             new_path = '/'.join(parts)
+        if self.is_absolute():
+            new_path = _normalize_path(new_path)
         return URL(self._val._replace(path=new_path, query='', fragment=''),
                    encoded=True)
 
@@ -464,7 +472,8 @@ def query(self):
         Empty value if URL has no query part.
 
         """
-        ret = MultiDict(parse_qsl(self.raw_query_string, keep_blank_values=True))
+        ret = MultiDict(parse_qsl(self.raw_query_string,
+                                  keep_blank_values=True))
         return MultiDictProxy(ret)
 
     @property
@@ -708,6 +717,8 @@ def with_path(self, path, encoded=False):
         """Return a new URL with path replaced."""
         if not encoded:
             path = _quote(path, safe='@:', protected='/', strict=self._strict)
+            if self.is_absolute():
+                path = _normalize_path(path)
         return URL(self._val._replace(path=path), encoded=True)
 
     def with_query(self, *args, **kwargs):
@@ -830,6 +841,8 @@ def with_name(self, name):
         if '/' in name:
             raise ValueError("Slash in name is not allowed")
         name = _quote(name, safe='@:', protected='/')
+        if name in ('.', '..'):
+            raise ValueError(". and .. values are forbidden")
         parts = list(self.raw_parts)
         if self.is_absolute():
             if len(parts) == 1:
@@ -873,3 +886,31 @@ def human_repr(self):
                                       self.path,
                                       self.query_string,
                                       self.fragment))
+
+
+def _normalize_path(path):
+    """Remove ``.`` and ``..`` segments from a ``/``-separated path.
+
+    Modelled on the "remove dot segments" step of RFC 3986: every ``.``
+    segment is dropped and every ``..`` segment removes the segment
+    before it.  A ``..`` with nothing left to remove is ignored.
+
+    The leading ``/`` of an absolute path is the root, not a segment, so
+    ``..`` can never remove it: ``/a/../../b`` becomes ``/b``, not ``b``.
+    When the last segment is ``.`` or ``..`` the result ends with ``/``.
+
+    :param path: path string, absolute or relative.
+    :return: the path with dot segments resolved.
+    """
+    # Split the root off first so that popping segments cannot consume it
+    # and silently turn an absolute path into a relative one.
+    prefix = ''
+    if path.startswith('/'):
+        prefix = '/'
+        path = path[1:]
+
+    segments = path.split('/')
+    resolved_path = []
+
+    for seg in segments:
+        if seg == '..':
+            # going above the top of the path is silently ignored
+            if resolved_path:
+                resolved_path.pop()
+        elif seg != '.':
+            resolved_path.append(seg)
+
+    if segments[-1] in ('.', '..'):
+        # the path ended with a directory reference, so the result must
+        # keep a trailing '/'
+        resolved_path.append('')
+
+    return prefix + '/'.join(resolved_path)