From bd7e883fd65c7e02adf76b1cd9526c383663547f Mon Sep 17 00:00:00 2001 From: Robert Ma Date: Thu, 27 Sep 2018 16:53:09 -0400 Subject: [PATCH] Rework encode/decode of headers in Request This change changes the encoding/decoding of headers in Request: now keys are always decoded (i.e. of text type) while values have binary type (ideally we would like raw binary data, but since the Python 3 standard library always decodes headers, we actually have to re-encode them in Python 3). Documentation and comments are also improved to clarify the encoding situation. --- .../wptserve/tests/functional/test_request.py | 2 +- tools/wptserve/wptserve/ranges.py | 4 ++ tools/wptserve/wptserve/request.py | 60 ++++++++++++------- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/tools/wptserve/tests/functional/test_request.py b/tools/wptserve/tests/functional/test_request.py index 47404c968dd6cb..3063a0311810b0 100644 --- a/tools/wptserve/tests/functional/test_request.py +++ b/tools/wptserve/tests/functional/test_request.py @@ -138,7 +138,7 @@ class TestAuth(TestUsingServer): def test_auth(self): @wptserve.handlers.handler def handler(request, response): - return " ".join((request.auth.username, request.auth.password)) + return b" ".join((request.auth.username, request.auth.password)) route = ("GET", "/test/test_auth", handler) self.server.router.register(*route) diff --git a/tools/wptserve/wptserve/ranges.py b/tools/wptserve/wptserve/ranges.py index 976cb1781a0dfe..104dca2ef6f834 100644 --- a/tools/wptserve/wptserve/ranges.py +++ b/tools/wptserve/wptserve/ranges.py @@ -3,6 +3,10 @@ class RangeParser(object): def __call__(self, header, file_size): + try: + header = header.decode("ascii") + except UnicodeDecodeError: + raise HTTPException(400, "Non-ASCII range header value") prefix = "bytes=" if not header.startswith(prefix): raise HTTPException(416, message="Unrecognised range type %s" % (header,)) diff --git a/tools/wptserve/wptserve/request.py b/tools/wptserve/wptserve/request.py index 233ff151c28e48..bcd39dd5b9c9f2 100644 --- a/tools/wptserve/wptserve/request.py +++ b/tools/wptserve/wptserve/request.py @@ -308,7 +308,7 @@ def POST(self): self.raw_input.seek(0) fs = cgi.FieldStorage(fp=self.raw_input, environ={"REQUEST_METHOD": self.method}, - headers=self.headers, + headers=self.raw_headers, keep_blank_values=True) self._POST = MultiDict.from_field_storage(fs) self.raw_input.seek(pos) @@ -318,8 +318,8 @@ def POST(self): def cookies(self): if self._cookies is None: parser = BaseCookie() - cookie_headers = self.headers.get("cookie", u"") - parser.load(cookie_headers.encode("utf-8")) + cookie_headers = self.headers.get("cookie", b"") + parser.load(cookie_headers) cookies = Cookies() for key, value in parser.iteritems(): cookies[key] = CookieValue(value) @@ -355,21 +355,34 @@ def __init__(self, request_handler): super(H2Request, self).__init__(request_handler) -def maybedecode(s): - if isinstance(s, text_type): - return s +def _maybe_encode(s): + """Encodes a text-type string into binary data using iso-8859-1. + Returns `str` in Python 2 and `bytes` in Python 3. The function is a no-op + if the argument already has a binary type. + """ if isinstance(s, binary_type): - return s.decode("ascii") + return s + + # Python 3 assumes iso-8859-1 when parsing headers, which will garble text + # with non ASCII characters. We try to encode the text back to binary. + # https://github.com/python/cpython/blob/273fc220b25933e443c82af6888eb1871d032fb8/Lib/http/client.py#L213 + if isinstance(s, text_type): + return s.encode("iso-8859-1") raise TypeError("Unexpected value in RequestHeaders: %r" % s) class RequestHeaders(dict): - """Dictionary-like API for accessing request headers.""" + """Read-only dictionary-like API for accessing request headers. + + Unlike BaseHTTPRequestHandler.headers, this class always returns all + headers with the same name (separated by commas). Besides, this class + ensures all keys (i.e. names of headers) and values have binary type. + """ def __init__(self, items): for header in items.keys(): - key = header.lower() + key = _maybe_encode(header).lower() # get all headers with the same name values = items.getallmatchingheaders(header) if len(values) > 1: @@ -379,17 +392,17 @@ def __init__(self, items): for value in values: # getallmatchingheaders returns raw header lines, so # split to get name, value - multiples.append(maybedecode(value).split(':', 1)[1].strip()) + multiples.append(_maybe_encode(value).split(b':', 1)[1].strip()) headers = multiples else: - headers = [maybedecode(items[header])] - dict.__setitem__(self, maybedecode(key), headers) + headers = [_maybe_encode(items[header])] + dict.__setitem__(self, key, headers) def __getitem__(self, key): """Get all headers of a certain (case-insensitive) name. If there is more than one, the values are returned comma separated""" - key = maybedecode(key) + key = _maybe_encode(key) values = dict.__getitem__(self, key.lower()) if len(values) == 1: return values[0] @@ -415,7 +428,7 @@ def get(self, key, default=None): def get_list(self, key, default=missing): """Get all the header values for a particular field name as a list""" - key = maybedecode(key) + key = _maybe_encode(key) try: return dict.__getitem__(self, key.lower()) except KeyError: @@ -425,7 +438,7 @@ def get_list(self, key, default=missing): raise def __contains__(self, key): - key = maybedecode(key) + key = _maybe_encode(key) return dict.__contains__(self, key.lower()) def iteritems(self): @@ -604,23 +617,28 @@ class Authentication(object): The password supplied in the HTTP Authorization header, or None + + Both attributes are binary strings (`str` in Py2, `bytes` in Py3), since + RFC7617 Section 2.1 does not specify the encoding for username & passsword + (as long it's compatible with ASCII). UTF-8 should be a relatively safe + choice if callers need to decode them as most browsers use it. """ def __init__(self, headers): self.username = None self.password = None - auth_schemes = {"Basic": self.decode_basic} + auth_schemes = {b"Basic": self.decode_basic} if "authorization" in headers: header = headers.get("authorization") - assert isinstance(header, text_type) - auth_type, data = header.split(" ", 1) + assert isinstance(header, binary_type) + auth_type, data = header.split(b" ", 1) if auth_type in auth_schemes: self.username, self.password = auth_schemes[auth_type](data) else: raise HTTPException(400, "Unsupported authentication scheme %s" % auth_type) def decode_basic(self, data): - assert isinstance(data, text_type) - decoded_data = base64.decodestring(data.encode("utf-8")) - return decoded_data.decode("utf-8").split(":", 1) + assert isinstance(data, binary_type) + decoded_data = base64.b64decode(data) + return decoded_data.split(b":", 1)