From 06a617aa21c434b5e607370e7ee0051d506a4793 Mon Sep 17 00:00:00 2001 From: Rajaram Gaunker Date: Thu, 11 May 2017 00:19:15 -0700 Subject: [PATCH] url: update IDNA error conditions This commit contains three separate changes: - Always return a string from ToUnicode no matter if an error occurred. - Disable CheckHyphens boolean flag. This flag will soon be enabled in the URL Standard, but is implemented manually by selectively ignoring certain errors. - Enable CheckBidi boolean flag per URL Standard update. This allows domain names with hyphens at 3 and 4th position, as well as those with leading and trailing hyphens. They are technically invalid, but seen in the wild. Tests are updated and simplified accordingly. PR-URL: https://github.com/nodejs/node/pull/12966 Fixes: https://github.com/nodejs/node/issues/12965 Refs: https://github.com/whatwg/url/pull/309 Reviewed-By: Daijiro Wachi Reviewed-By: Timothy Gu --- src/node_i18n.cc | 38 +++++++++++-------- src/node_i18n.h | 3 +- test/fixtures/url-idna.js | 33 ++++++++++------ test/parallel/test-icu-punycode.js | 22 ++++------- .../parallel/test-url-domain-ascii-unicode.js | 3 +- test/parallel/test-whatwg-url-domainto.js | 11 ++---- 6 files changed, 59 insertions(+), 51 deletions(-) diff --git a/src/node_i18n.cc b/src/node_i18n.cc index 30394f3a47b79f..98818b581b7f95 100644 --- a/src/node_i18n.cc +++ b/src/node_i18n.cc @@ -436,11 +436,9 @@ bool InitializeICUDirectory(const std::string& path) { int32_t ToUnicode(MaybeStackBuffer* buf, const char* input, - size_t length, - bool lenient) { + size_t length) { UErrorCode status = U_ZERO_ERROR; - uint32_t options = UIDNA_DEFAULT; - options |= UIDNA_NONTRANSITIONAL_TO_UNICODE; + uint32_t options = UIDNA_NONTRANSITIONAL_TO_UNICODE; UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; @@ -462,14 +460,10 @@ int32_t ToUnicode(MaybeStackBuffer* buf, &status); } - // UTS #46's ToUnicode operation applies no validation of domain name length - // (nor a flag requesting it to do so, like VerifyDnsLength for ToASCII). For - // that reason, unlike ToASCII below, ICU4C correctly accepts long domain - // names. However, ICU4C still sets the EMPTY_LABEL error in contrary to UTS - // #46. Therefore, explicitly filters out that error here. - info.errors &= ~UIDNA_ERROR_EMPTY_LABEL; + // info.errors is ignored as UTS #46 ToUnicode always produces a Unicode + // string, regardless of whether an error occurred. - if (U_FAILURE(status) || (!lenient && info.errors != 0)) { + if (U_FAILURE(status)) { len = -1; buf->SetLength(0); } else { @@ -485,8 +479,7 @@ int32_t ToASCII(MaybeStackBuffer* buf, size_t length, bool lenient) { UErrorCode status = U_ZERO_ERROR; - uint32_t options = UIDNA_DEFAULT; - options |= UIDNA_NONTRANSITIONAL_TO_ASCII; + uint32_t options = UIDNA_NONTRANSITIONAL_TO_ASCII | UIDNA_CHECK_BIDI; UIDNA* uidna = uidna_openUTS46(options, &status); if (U_FAILURE(status)) return -1; @@ -518,6 +511,21 @@ int32_t ToASCII(MaybeStackBuffer* buf, info.errors &= ~UIDNA_ERROR_LABEL_TOO_LONG; info.errors &= ~UIDNA_ERROR_DOMAIN_NAME_TOO_LONG; + // These error conditions are mandated unconditionally by UTS #46 version + // 9.0.0 (rev. 17), but were found to be incompatible with actual domain + // names in the wild. As such, in the current UTS #46 draft (rev. 18) these + // checks are made optional depending on the CheckHyphens flag, which will be + // disabled in WHATWG URL's "domain to ASCII" algorithm soon. + // Refs: + // - https://github.com/whatwg/url/issues/53 + // - https://github.com/whatwg/url/pull/309 + // - http://www.unicode.org/review/pri317/ + // - http://www.unicode.org/reports/tr46/tr46-18.html + // - https://www.icann.org/news/announcement-2000-01-07-en + info.errors &= ~UIDNA_ERROR_HYPHEN_3_4; + info.errors &= ~UIDNA_ERROR_LEADING_HYPHEN; + info.errors &= ~UIDNA_ERROR_TRAILING_HYPHEN; + if (U_FAILURE(status) || (!lenient && info.errors != 0)) { len = -1; buf->SetLength(0); @@ -534,11 +542,9 @@ static void ToUnicode(const FunctionCallbackInfo& args) { CHECK_GE(args.Length(), 1); CHECK(args[0]->IsString()); Utf8Value val(env->isolate(), args[0]); - // optional arg - bool lenient = args[1]->BooleanValue(env->context()).FromJust(); MaybeStackBuffer buf; - int32_t len = ToUnicode(&buf, *val, val.length(), lenient); + int32_t len = ToUnicode(&buf, *val, val.length()); if (len < 0) { return env->ThrowError("Cannot convert name to Unicode"); diff --git a/src/node_i18n.h b/src/node_i18n.h index 78e18fdb370c94..cc1f3e6ea53569 100644 --- a/src/node_i18n.h +++ b/src/node_i18n.h @@ -43,8 +43,7 @@ int32_t ToASCII(MaybeStackBuffer* buf, bool lenient = false); int32_t ToUnicode(MaybeStackBuffer* buf, const char* input, - size_t length, - bool lenient = false); + size_t length); } // namespace i18n } // namespace node diff --git a/test/fixtures/url-idna.js b/test/fixtures/url-idna.js index f105adeed1243b..cbfe702e9372bf 100644 --- a/test/fixtures/url-idna.js +++ b/test/fixtures/url-idna.js @@ -191,22 +191,33 @@ module.exports = { { ascii: `${`${'a'.repeat(64)}.`.repeat(4)}com`, unicode: `${`${'a'.repeat(64)}.`.repeat(4)}com` - } - ], - invalid: [ - // invalid character + }, + // URLs with hyphen + { + ascii: 'r4---sn-a5mlrn7s.gevideo.com', + unicode: 'r4---sn-a5mlrn7s.gevideo.com' + }, + { + ascii: '-sn-a5mlrn7s.gevideo.com', + unicode: '-sn-a5mlrn7s.gevideo.com' + }, { - url: '\ufffd.com', - mode: 'ascii' + ascii: 'sn-a5mlrn7s-.gevideo.com', + unicode: 'sn-a5mlrn7s-.gevideo.com' }, { - url: '\ufffd.com', - mode: 'unicode' + ascii: '-sn-a5mlrn7s-.gevideo.com', + unicode: '-sn-a5mlrn7s-.gevideo.com' }, - // invalid Punycode { - url: 'xn---abc.com', - mode: 'unicode' + ascii: '-sn--a5mlrn7s-.gevideo.com', + unicode: '-sn--a5mlrn7s-.gevideo.com' } + ], + invalid: [ + // invalid character + '\ufffd.com', + // invalid bi-directional character + 'تشادرlatin.icom.museum' ] } diff --git a/test/parallel/test-icu-punycode.js b/test/parallel/test-icu-punycode.js index 411704bb8f4477..ba2014bdc85a2f 100644 --- a/test/parallel/test-icu-punycode.js +++ b/test/parallel/test-icu-punycode.js @@ -23,19 +23,13 @@ const tests = require('../fixtures/url-idna.js'); } { - const errorRe = { - ascii: /^Error: Cannot convert name to ASCII$/, - unicode: /^Error: Cannot convert name to Unicode$/ - }; - const convertFunc = { - ascii: icu.toASCII, - unicode: icu.toUnicode - }; - - for (const [i, { url, mode }] of tests.invalid.entries()) { - assert.throws(() => convertFunc[mode](url), errorRe[mode], - `Invalid case ${i + 1}`); - assert.doesNotThrow(() => convertFunc[mode](url, true), - `Invalid case ${i + 1} in lenient mode`); + for (const [i, url] of tests.invalid.entries()) { + assert.throws(() => icu.toASCII(url), + /^Error: Cannot convert name to ASCII$/, + `ToASCII invalid case ${i + 1}`); + assert.doesNotThrow(() => icu.toASCII(url, true), + `ToASCII invalid case ${i + 1} in lenient mode`); + assert.doesNotThrow(() => icu.toUnicode(url), + `ToUnicode invalid case ${i + 1}`); } } diff --git a/test/parallel/test-url-domain-ascii-unicode.js b/test/parallel/test-url-domain-ascii-unicode.js index dcc918201bb93e..9c288cb6e29c38 100644 --- a/test/parallel/test-url-domain-ascii-unicode.js +++ b/test/parallel/test-url-domain-ascii-unicode.js @@ -8,7 +8,8 @@ const domainToASCII = url.domainToASCII; const domainToUnicode = url.domainToUnicode; const domainWithASCII = [ - ['ıídيٴ', 'xn--d-iga7ro0q9f'], + ['ıíd', 'xn--d-iga7r'], + ['يٴ', 'xn--mhb8f'], ['www.ϧƽəʐ.com', 'www.xn--cja62apfr6c.com'], ['новини.com', 'xn--b1amarcd.com'], ['名がドメイン.com', 'xn--v8jxj3d1dzdz08w.com'], diff --git a/test/parallel/test-whatwg-url-domainto.js b/test/parallel/test-whatwg-url-domainto.js index 13ff9968705b98..90d9ee4a8c4648 100644 --- a/test/parallel/test-whatwg-url-domainto.js +++ b/test/parallel/test-whatwg-url-domainto.js @@ -35,11 +35,8 @@ const tests = require('../fixtures/url-idna.js'); } { - const convertFunc = { - ascii: domainToASCII, - unicode: domainToUnicode - }; - - for (const [i, { url, mode }] of tests.invalid.entries()) - assert.strictEqual(convertFunc[mode](url), '', `Invalid case ${i + 1}`); + for (const [i, url] of tests.invalid.entries()) { + assert.strictEqual(domainToASCII(url), '', `Invalid case ${i + 1}`); + assert.strictEqual(domainToUnicode(url), '', `Invalid case ${i + 1}`); + } }