Fix #84: normalize path by resolving '.' and '..' segments

aio-libs · Jun 24, 2017 · c7fc56c · c7fc56c
1 parent c405598
commit c7fc56c
Show file tree

Hide file tree

Showing 2 changed files with 80 additions and 10 deletions.
diff --git a/tests/test_url.py b/tests/test_url.py
@@ -44,6 +44,11 @@ def test_origin_no_scheme():
         url.origin()
 
 
+def test_drop_dots():
+    url = URL('http://example.com/path/../to')  # '..' cancels out 'path'
+    assert str(url) == 'http://example.com/to'
+
+
 def test_abs_cmp():
     assert URL('http://example.com:8888') == URL('http://example.com:8888')
     assert URL('http://example.com:8888/') == URL('http://example.com:8888/')
@@ -495,6 +500,11 @@ def test_div_with_colon_and_at():
     assert url.raw_path == '/base/path:abc@123'
 
 
+def test_div_with_dots():
+    joined = URL('http://example.com/base') / '../path/./to'
+    assert joined.raw_path == '/path/to'  # '..' drops 'base', '.' vanishes
+
+
 # comparison and hashing
 
 def test_ne_str():
@@ -915,6 +925,8 @@ def test_with_fragment_bad_type():
     with pytest.raises(TypeError):
         url.with_fragment(123)
 
+# with_name
+
 
 def test_with_name():
     url = URL('http://example.com/a/b')
@@ -980,6 +992,16 @@ def test_with_name_within_colon_and_at():
     url = URL('http://example.com/oldpath').with_name('path:abc@123')
     assert url.raw_path == '/path:abc@123'
 
+
+def test_with_name_dot():
+    with pytest.raises(ValueError):  # '.' cannot be used as a name
+        URL('http://example.com').with_name(name='.')
+
+
+def test_with_name_double_dot():
+    with pytest.raises(ValueError):  # '..' cannot be used as a name
+        URL('http://example.com').with_name(name='..')
+
 # is_absolute
 
 
@@ -1549,3 +1571,12 @@ def test_build_query_quoting():
     assert u == URL('http://127.0.0.1/файл.jpg?arg=Привет')
     assert str(u) == ('http://127.0.0.1/%D1%84%D0%B0%D0%B9%D0%BB.jpg?'
                       'arg=%D0%9F%D1%80%D0%B8%D0%B2%D0%B5%D1%82')
+
+
+def test_build_drop_dots():
+    """URL.build() resolves '..' segments in an absolute URL's path."""
+    parts = dict(scheme='http', host='example.com', path='/path/../to')
+    built = URL.build(**parts)
+
+    expected = 'http://example.com/to'
+    assert str(built) == expected
diff --git a/yarl/__init__.py b/yarl/__init__.py
@@ -181,13 +181,14 @@ def __init__(self, val='', *, encoded=False, strict=False):
                         user += ':' + _quote(val.password)
                     netloc = user + '@' + netloc
 
-            val = SplitResult(
-                val[0],  # scheme
-                netloc,
-                _quote(val[2], safe='+@:', protected='/+', strict=strict),
-                query=_quote(val[3], safe='=+&?/:@',
-                             protected=PROTECT_CHARS, qs=True, strict=strict),
-                fragment=_quote(val[4], safe='?/:@', strict=strict))
+            path = _quote(val[2], safe='+@:', protected='/+', strict=strict)
+            if netloc:
+                path = _normalize_path(path)
+
+            query = _quote(val[3], safe='=+&?/:@',
+                           protected=PROTECT_CHARS, qs=True, strict=strict)
+            fragment = _quote(val[4], safe='?/:@', strict=strict)
+            val = SplitResult(val[0], netloc, path, query, fragment)
 
         self._val = val
         self._cache = {}
@@ -207,11 +208,16 @@ def build(cls, *, scheme='', user='', password='', host='', port=None,
             raise ValueError(
                 "Only one of \"query\" or \"query_string\" should be passed")
 
+        netloc = cls._make_netloc(user, password, host, port)
+        path = _quote(path, safe='@:', protected='/')
+        if netloc:
+            path = _normalize_path(path)
+
         url = cls(
             SplitResult(
                 scheme,
-                cls._make_netloc(user, password, host, port),
-                _quote(path, safe='@:', protected='/'),
+                netloc,
+                path,
                 _quote(query_string),
                 fragment
             ),
@@ -290,6 +296,8 @@ def __truediv__(self, name):
             parts = path.rstrip('/').split('/')
             parts.append(name)
             new_path = '/'.join(parts)
+        if self.is_absolute():
+            new_path = _normalize_path(new_path)
         return URL(self._val._replace(path=new_path, query='', fragment=''),
                    encoded=True)
 
@@ -464,7 +472,8 @@ def query(self):
         Empty value if URL has no query part.
 
         """
-        ret = MultiDict(parse_qsl(self.raw_query_string, keep_blank_values=True))
+        ret = MultiDict(parse_qsl(self.raw_query_string,
+                                  keep_blank_values=True))
         return MultiDictProxy(ret)
 
     @property
@@ -830,6 +839,8 @@ def with_name(self, name):
         if '/' in name:
             raise ValueError("Slash in name is not allowed")
         name = _quote(name, safe='@:', protected='/')
+        if name in ('.', '..'):
+            raise ValueError(". and .. values are forbidden")
         parts = list(self.raw_parts)
         if self.is_absolute():
             if len(parts) == 1:
@@ -873,3 +884,31 @@ def human_repr(self):
                                       self.path,
                                       self.query_string,
                                       self.fragment))
+
+
+def _normalize_path(path):
+    """Return *path* with '.' and '..' segments resolved (RFC 3986, 5.2.4).
+
+    The leading '/' of an absolute path is always kept, and any '..' that
+    would climb above the root is ignored.  A path whose last segment is
+    '.' or '..' names a directory, so the result ends with '/'.
+    """
+    # Split the root off first so that '..' can never pop it.
+    root = '/' if path.startswith('/') else ''
+    segments = path[len(root):].split('/')
+    resolved_path = []
+
+    for seg in segments:
+        if seg == '..':
+            # Step up one level; there is nothing to drop at the top.
+            if resolved_path:
+                resolved_path.pop()
+        elif seg != '.':
+            resolved_path.append(seg)
+
+    if segments[-1] in ('.', '..'):
+        # The last segment was a relative directory reference, so the
+        # result must keep its trailing '/'.
+        resolved_path.append('')
+
+    return root + '/'.join(resolved_path)