Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate out some jsg::Url mods from the module registry refactor #1823

Merged
merged 1 commit into from
Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions src/workerd/jsg/url-test.c++
Original file line number Diff line number Diff line change
Expand Up @@ -3145,6 +3145,47 @@ KJ_TEST("Can parse") {
}
}

KJ_TEST("Normalize path for comparison and cloning") {
  // Parsing leaves percent-escapes in the path untouched: although `f` never
  // needs to be escaped, `%66oo` comes back exactly as written. For comparison
  // and cloning we sometimes want `%66oo` and `foo` to count as the same path,
  // which is what the NORMALIZE_PATH option provides: the path is
  // percent-decoded and then re-encoded. This has a definite performance cost,
  // so it should only be used when necessary.
  constexpr auto kNormalizePath = Url::EquivalenceOption::NORMALIZE_PATH;

  auto escapedLetter = "file:///%66oo/boo%fe"_url;
  auto lowerHex = "file:///foo/boo%fe"_url;
  auto upperHex = "file:///foo/boo%FE"_url;

  // Equivalence under normalization holds in both directions.
  KJ_ASSERT(escapedLetter.equal(lowerHex, kNormalizePath));
  KJ_ASSERT(lowerHex.equal(escapedLetter, kNormalizePath));

  // A normalized clone decodes the needless escape and upper-cases the hex digits.
  auto escapedLetterNormalized = escapedLetter.clone(kNormalizePath);
  KJ_ASSERT(upperHex == escapedLetterNormalized);

  // An escaped slash (%2f) is never decoded, so this URL is not equivalent.
  auto escapedSlash = KJ_ASSERT_NONNULL(Url::tryParse("file:///foo%2fboo%fe"_kj));
  KJ_ASSERT(!escapedSlash.equal(lowerHex, kNormalizePath));

  auto escapedSlashNormalized = escapedSlash.clone(kNormalizePath);
  KJ_ASSERT(escapedSlashNormalized.getHref() == "file:///foo%2Fboo%FE"_kj);

  // A trailing escaped slash survives normalization.
  auto trailingEscapedSlash = "file:///foo%2Fboo%2F"_url;
  trailingEscapedSlash = trailingEscapedSlash.clone(kNormalizePath);
  KJ_ASSERT(trailingEscapedSlash.getHref() == "file:///foo%2Fboo%2F"_kj);

  // Adjacent escaped slashes are preserved, whichever hex case was used.
  auto upperThenLower = "file:///foo%2F%2f/bar"_url;
  upperThenLower = upperThenLower.clone(kNormalizePath);
  KJ_ASSERT(upperThenLower.getHref() == "file:///foo%2F%2F/bar"_kj);

  auto lowerThenUpper = "file:///foo%2f%2F/bar"_url;
  lowerThenUpper = lowerThenUpper.clone(kNormalizePath);
  KJ_ASSERT(lowerThenUpper.getHref() == "file:///foo%2F%2F/bar"_kj);
}

} // namespace
} // namespace workerd::jsg::test

100 changes: 92 additions & 8 deletions src/workerd/jsg/url.c++
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
extern "C" {
#include "ada_c.h"
}
#include "ada.h"
#include <string>
#include <vector>

#include <kj/debug.h>
#include <kj/string-tree.h>
Expand Down Expand Up @@ -44,6 +47,57 @@ T getInner(const kj::Own<void>& inner) {
return const_cast<T>(value);
}

// Normalizes the percent-encoding of a URL pathname so that equivalent paths
// (e.g. `%66oo` and `foo`) produce the same bytes.
//
// pathname: the pathname text to normalize.
// Returns:  a newly allocated array holding the normalized pathname.
//
// Escaped slashes (`%2f` / `%2F`) are never decoded; they are emitted as `%2F`.
kj::Array<const char> normalizePathEncoding(kj::ArrayPtr<const char> pathname) {
  // Sadly, this is a bit tricky because we do not want to decode %2f as a slash;
  // we want to keep those as is. So we split the input around those sequences.
  // We need to split on either %2f or %2F, so we search for them ourselves,
  // percent-decode each piece as we go, re-encode the pieces, and then join them
  // back together with %2F.

  // Returns the offset of the first `%2f` / `%2F` in `input`, if any.
  static constexpr auto findNext = [](std::string_view input) -> kj::Maybe<size_t> {
    size_t pos = input.find("%2", 0);
    while (pos != std::string_view::npos) {
      // A "%2" at the very end of the input has no third character to inspect,
      // and nothing can follow it, so the search is over.
      if (pos + 2 >= input.size()) break;
      const char hex = input[pos + 2];
      if (hex == 'f' || hex == 'F') {
        return pos;
      }
      // Some other escape such as %20 or %2a: keep scanning, otherwise a later
      // %2f would be missed and wrongly decoded into a real slash.
      pos = input.find("%2", pos + 1);
    }
    return kj::none;
  };

  std::string_view input(pathname.begin(), pathname.end());
  std::vector<std::string> parts;

  while (true) {
    if (input.size() == 0) {
      // Either the pathname was empty or it ended with an escaped slash; the
      // empty trailing piece keeps that final %2F when the pieces are joined.
      parts.push_back("");
      break;
    }
    KJ_IF_SOME(pos, findNext(input)) {
      parts.push_back(ada::unicode::percent_decode(input.substr(0, pos), 0));
      // Skip the three characters of the escaped slash itself.
      input = input.substr(pos + 3);
      continue;
    } else {
      // No more %2f or %2F found. Add the remaining input to parts.
      parts.push_back(ada::unicode::percent_decode(input, 0));
      break;
    }
  }

  std::string res;
  bool first = true;
  for (auto& part : parts) {
    auto encoded = ada::unicode::percent_encode(part, ada::character_sets::PATH_PERCENT_ENCODE);
    if (!first) res += "%2F";
    else first = false;
    res += encoded;
  }

  kj::Array<const char> ret = kj::heapArray<const char>(res.length());
  memcpy(const_cast<char*>(ret.begin()), res.data(), res.length());
  return kj::mv(ret);
}

} // namespace

// Takes ownership of the given opaque handle and stores it as this Url's state.
Url::Url(kj::Own<void> inner)
    : inner(kj::mv(inner)) {}
Expand All @@ -57,13 +111,28 @@ bool Url::equal(const Url& other, EquivalenceOption option) const {
return *this == other;
}

auto otherPathname = other.getPathname();
auto thisPathname = getPathname();
kj::Array<const char> otherPathnameStore = nullptr;
kj::Array<const char> thisPathnameStore = nullptr;

if ((option & EquivalenceOption::NORMALIZE_PATH) == EquivalenceOption::NORMALIZE_PATH) {
otherPathnameStore = normalizePathEncoding(otherPathname);
otherPathname = otherPathnameStore;
thisPathnameStore = normalizePathEncoding(thisPathname);
thisPathname = thisPathnameStore;
}

// If we are ignoring fragments, we'll compare each component separately:
return other.getProtocol() == getProtocol() &&
other.getHost() == getHost() &&
other.getUsername() == getUsername() &&
other.getPassword() == getPassword() &&
other.getPathname() == getPathname() &&
other.getSearch() == getSearch();
return (other.getProtocol() == getProtocol()) &&
(other.getHost() == getHost()) &&
(other.getUsername() == getUsername()) &&
(other.getPassword() == getPassword()) &&
(otherPathname == thisPathname) &&
(((option & EquivalenceOption::IGNORE_SEARCH) == EquivalenceOption::IGNORE_SEARCH) ?
true : other.getSearch() == getSearch()) &&
(((option & EquivalenceOption::IGNORE_FRAGMENTS) == EquivalenceOption::IGNORE_FRAGMENTS) ?
true : other.getHash() == getHash());
}

bool Url::canParse(kj::StringPtr input, kj::Maybe<kj::StringPtr> base) {
Expand Down Expand Up @@ -221,11 +290,18 @@ Url::HostType Url::getHostType() const {
return static_cast<HostType>(value);
}

Url Url::clone(EquivalenceOption option) {
Url Url::clone(EquivalenceOption option) const {
ada_url copy = ada_copy(getInner<ada_url>(inner));
if (option == EquivalenceOption::IGNORE_FRAGMENTS) {
if ((option & EquivalenceOption::IGNORE_FRAGMENTS) == EquivalenceOption::IGNORE_FRAGMENTS) {
ada_clear_hash(copy);
}
if ((option & EquivalenceOption::IGNORE_SEARCH) == EquivalenceOption::IGNORE_SEARCH) {
ada_clear_search(copy);
}
if ((option & EquivalenceOption::NORMALIZE_PATH) == EquivalenceOption::NORMALIZE_PATH) {
auto normalized = normalizePathEncoding(getPathname());
ada_set_pathname(copy, normalized.begin(), normalized.size());
}
return Url(wrap(copy));
}

Expand All @@ -245,10 +321,18 @@ kj::Array<const char> Url::idnToAscii(kj::ArrayPtr<const char> value) {
AdaOwnedStringDisposer::INSTANCE);
}

// Parses `input` with this URL's href as the base and returns whatever
// tryParse() yields for that input/base pair.
kj::Maybe<Url> Url::tryResolve(kj::ArrayPtr<const char> input) const {
  return Url::tryParse(input, this->getHref());
}

// Hashes this URL by hashing its href.
kj::uint Url::hashCode() const {
  return kj::hashCode(this->getHref());
}

// User-defined literal: parses the literal's characters as a URL and asserts
// that parsing succeeded.
const Url operator "" _url(const char* str, size_t size) {
  kj::ArrayPtr<const char> text(str, size);
  return KJ_ASSERT_NONNULL(Url::tryParse(text));
}

// ======================================================================================

namespace {
Expand Down
28 changes: 25 additions & 3 deletions src/workerd/jsg/url.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,16 @@ class Url final {
bool operator==(const Url& other) const KJ_WARN_UNUSED_RESULT;

enum class EquivalenceOption {
DEFAULT,
IGNORE_FRAGMENTS,
DEFAULT = 0,
// When set, the fragment/hash portion of the URL will be ignored when comparing or
// cloning URLs.
IGNORE_FRAGMENTS = 1 << 0,
// When set, the search portion of the URL will be ignored when comparing or cloning URLs.
IGNORE_SEARCH = 1 << 1,
// When set, the pathname portion of the URL will be normalized by percent-decoding
// then re-encoding the pathname. This is useful when comparing URLs that may have
// different, but equivalent percent-encoded paths. e.g. %66oo and foo are equivalent.
NORMALIZE_PATH = 1 << 2,
};

bool equal(const Url& other, EquivalenceOption option = EquivalenceOption::DEFAULT) const
Expand Down Expand Up @@ -89,7 +97,10 @@ class Url final {

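// Copies this Url. The option is a combination of EquivalenceOption flags: with
// IGNORE_FRAGMENTS the copy has any fragment/hash cleared, with IGNORE_SEARCH the copy
// has its search cleared, and with NORMALIZE_PATH the copy's pathname is normalized.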
Url clone(EquivalenceOption option = EquivalenceOption::DEFAULT) KJ_WARN_UNUSED_RESULT;
Url clone(EquivalenceOption option = EquivalenceOption::DEFAULT) const KJ_WARN_UNUSED_RESULT;

// Resolve the input relative to this URL
kj::Maybe<Url> tryResolve(kj::ArrayPtr<const char> input) const KJ_WARN_UNUSED_RESULT;

HostType getHostType() const;
SchemeType getSchemeType() const;
Expand Down Expand Up @@ -118,6 +129,13 @@ class Url final {
kj::Own<void> inner;
};

// Combines two EquivalenceOption values by OR-ing their integer representations.
constexpr Url::EquivalenceOption operator|(Url::EquivalenceOption a, Url::EquivalenceOption b) {
  using Option = Url::EquivalenceOption;
  return static_cast<Option>(static_cast<int>(a) | static_cast<int>(b));
}
// Intersects two EquivalenceOption values by AND-ing their integer representations.
constexpr Url::EquivalenceOption operator&(Url::EquivalenceOption a, Url::EquivalenceOption b) {
  using Option = Url::EquivalenceOption;
  return static_cast<Option>(static_cast<int>(a) & static_cast<int>(b));
}

class UrlSearchParams final {
public:
class KeyIterator final {
Expand Down Expand Up @@ -333,4 +351,8 @@ class UrlPattern final {
static Result<UrlPattern> tryCompileInit(UrlPattern::Init init, const CompileOptions& options);
};

// Append _url to a string literal to create a parsed URL. An assert will be triggered
// if the value cannot be parsed successfully.
const Url operator "" _url(const char* str, size_t size) KJ_WARN_UNUSED_RESULT;

} // namespace workerd::jsg
Loading