Skip to content

Commit

Permalink
Move split_url cache around encoding the URL (#1369)
Browse files Browse the repository at this point in the history
  • Loading branch information
bdraco authored Oct 21, 2024
1 parent 7175383 commit a8b8d7f
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 81 deletions.
1 change: 1 addition & 0 deletions CHANGES/1369.misc.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Reworked the internal encoding cache to improve performance on cache hit -- by :user:`bdraco`.
6 changes: 3 additions & 3 deletions tests/test_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ def test_origin():
assert URL("http://example.com:8888") == url.origin()


def test_origin_is_self():
def test_origin_is_equal_to_self():
url = URL("http://example.com:8888")
assert url.origin() is url
assert url.origin() == url


def test_origin_with_no_auth():
Expand Down Expand Up @@ -1737,7 +1737,7 @@ def test_str_for_empty_url():

def test_parent_for_empty_url():
url = URL()
assert url is url.parent
assert url == url.parent


def test_parent_for_relative_url_with_child():
Expand Down
7 changes: 3 additions & 4 deletions yarl/_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import unicodedata
from functools import lru_cache
from typing import Union
from urllib.parse import SplitResult, scheme_chars, uses_netloc
from urllib.parse import scheme_chars, uses_netloc

from ._quoters import QUOTER

Expand All @@ -20,8 +20,7 @@
USES_AUTHORITY = frozenset(uses_netloc)


@lru_cache
def split_url(url: str) -> SplitResult:
def split_url(url: str) -> tuple[str, str, str, str, str]:
"""Split URL into parts."""
# Adapted from urllib.parse.urlsplit
# Only lstrip url as some applications rely on preserving trailing space.
Expand Down Expand Up @@ -79,7 +78,7 @@ def split_url(url: str) -> SplitResult:
url, _, query = url.partition("?")
if netloc and not netloc.isascii():
_check_netloc(netloc)
return tuple.__new__(SplitResult, (scheme, netloc, url, query, fragment))
return scheme, netloc, url, query, fragment


def _check_netloc(netloc: str) -> None:
Expand Down
149 changes: 75 additions & 74 deletions yarl/_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,70 @@ def rewrite_module(obj: _T) -> _T:
return obj


@lru_cache
def encode_url(url_str: str) -> tuple[SplitResult, _InternalURLCache]:
    """Split an unencoded URL string and requote each of its components.

    Returns a ``(split_result, cache)`` pair.  ``split_result`` holds the
    re-encoded scheme, netloc, path, query and fragment.  ``cache`` always
    carries ``scheme``, ``raw_query_string`` and ``raw_fragment``; when the
    URL has a netloc it also carries ``raw_host``, ``explicit_port``,
    ``raw_user`` and ``raw_password``.

    The function is wrapped in ``lru_cache``, so a cache hit hands back the
    previously built pair as-is -- including the same dict object.

    Raises ``ValueError`` when the netloc yields no host and the scheme is
    listed in ``SCHEME_REQUIRES_HOST``.
    """
    parsed: _InternalURLCache = {}
    host: Union[str, None]
    scheme, netloc, path, query, fragment = split_url(url_str)

    if netloc:
        if ":" in netloc or "@" in netloc or "[" in netloc:
            # Port, credentials or a bracketed host present: needs the
            # full netloc splitter.
            username, password, host, port = split_netloc(netloc)
        else:
            # Bare host name only.
            host = netloc
            username = password = port = None

        if host is None:
            if scheme in SCHEME_REQUIRES_HOST:
                raise ValueError(
                    "Invalid URL: host is required for "
                    f"absolute urls with the {scheme} scheme"
                )
            host = ""

        host = _encode_host(host, validate_host=False)
        # The host encoder puts brackets back around IPv6 addresses;
        # the cached raw host is stored without them.
        parsed["raw_host"] = host[1:-1] if "[" in host else host
        parsed["explicit_port"] = port

        if username is None and password is None:
            # No credentials: assemble the netloc without the generic builder.
            raw_user = raw_password = None
            netloc = host if port is None else f"{host}:{port}"
        else:
            raw_user = REQUOTER(username) if username else username
            raw_password = REQUOTER(password) if password else password
            netloc = make_netloc(raw_user, raw_password, host, port)
        parsed["raw_user"] = raw_user
        parsed["raw_password"] = raw_password
    else:
        # No authority component, so there is nothing to encode or cache.
        host = ""

    if path:
        path = PATH_REQUOTER(path)
        # Dot segments are only collapsed when the re-encoded netloc
        # is non-empty.
        if netloc and "." in path:
            path = normalize_path(path)

    if query:
        query = QUERY_REQUOTER(query)
    if fragment:
        fragment = FRAGMENT_REQUOTER(fragment)

    parsed["scheme"] = scheme
    parsed["raw_query_string"] = query
    parsed["raw_fragment"] = fragment

    # Build the SplitResult through tuple.__new__ instead of calling the
    # class: NamedTuple constructors go through a lambda built at run time
    # plus argument processing, and this skips that overhead.
    # https://github.com/python/cpython/blob/d83fcf8371f2f33c7797bc8f5423a8bca8c46e5c/Lib/collections/__init__.py#L441
    components = (scheme, netloc, path, query, fragment)
    return tuple.__new__(SplitResult, components), parsed


@lru_cache
def pre_encoded_url(url_str: str) -> tuple[SplitResult, _InternalURLCache]:
    """Split an already-encoded URL string without requoting anything.

    Returns the parts produced by ``split_url`` wrapped as a
    ``SplitResult``, paired with an empty cache dict.  Because of
    ``lru_cache``, a cache hit returns the previously built pair as-is.
    """
    parts = split_url(url_str)
    # tuple.__new__ builds the SplitResult directly from the parts tuple,
    # without going through the NamedTuple constructor.
    split_result = tuple.__new__(SplitResult, parts)
    empty_cache: _InternalURLCache = {}
    return split_result, empty_cache


@rewrite_module
class URL:
# Don't derive from str
Expand Down Expand Up @@ -206,89 +270,26 @@ def __new__(
if strict is not None: # pragma: no cover
warnings.warn("strict parameter is ignored")
if type(val) is str:
val = split_url(val)
pass
elif type(val) is cls:
return val
elif type(val) is SplitResult:
if not encoded:
raise ValueError("Cannot apply decoding to SplitResult")
self = object.__new__(cls)
self._val = val
self._cache = {}
return self
elif isinstance(val, str):
val = split_url(str(val))
val = str(val)
else:
raise TypeError("Constructor parameter should be str")

cache: _InternalURLCache = {}
if not encoded:
host: Union[str, None]
scheme, netloc, path, query, fragment = val
orig_netloc = netloc
orig_path = path
orig_query = query
orig_fragment = fragment
if not netloc: # netloc
host = ""
else:
if ":" in netloc or "@" in netloc or "[" in netloc:
# Complex netloc
username, password, host, port = split_netloc(netloc)
else:
username = password = port = None
host = netloc
if host is None:
if scheme in SCHEME_REQUIRES_HOST:
msg = (
"Invalid URL: host is required for "
f"absolute urls with the {scheme} scheme"
)
raise ValueError(msg)
else:
host = ""
host = _encode_host(host, validate_host=False)
# Remove brackets as host encoder adds back brackets for IPv6 addresses
cache["raw_host"] = host[1:-1] if "[" in host else host
cache["explicit_port"] = port
if password is None and username is None:
# Fast path for URLs without user, password
netloc = host if port is None else f"{host}:{port}"
cache["raw_user"] = None
cache["raw_password"] = None
else:
raw_user = REQUOTER(username) if username else username
raw_password = REQUOTER(password) if password else password
netloc = make_netloc(raw_user, raw_password, host, port)
cache["raw_user"] = raw_user
cache["raw_password"] = raw_password

if path:
path = PATH_REQUOTER(path)
if netloc:
if "." in path:
path = normalize_path(path)

query = QUERY_REQUOTER(query) if query else query
fragment = FRAGMENT_REQUOTER(fragment) if fragment else fragment
cache["scheme"] = scheme
cache["raw_query_string"] = query
cache["raw_fragment"] = fragment
# There is a good chance that the SplitResult is already normalized
# so we can avoid the extra work of creating a new SplitResult
# if the input SplitResult is already normalized
if (
orig_netloc != netloc
or orig_path != path
or orig_query != query
or orig_fragment != fragment
):
# Constructing the tuple directly to avoid the overhead of
# the lambda and arg processing since NamedTuples are constructed
# with a run time built lambda
# https://github.com/python/cpython/blob/d83fcf8371f2f33c7797bc8f5423a8bca8c46e5c/Lib/collections/__init__.py#L441
val = tuple.__new__(
SplitResult, (scheme, netloc, path, query, fragment)
)

if encoded:
split_result, cache = pre_encoded_url(val)
else:
split_result, cache = encode_url(val)
self = object.__new__(cls)
self._val = val
self._val = split_result
self._cache = cache
return self

Expand Down

0 comments on commit a8b8d7f

Please sign in to comment.