From f1c3592ec66a05835cce2cd8a8fa68ab000e7fa4 Mon Sep 17 00:00:00 2001 From: Toru Nagashima Date: Thu, 2 Apr 2020 19:57:21 +0900 Subject: [PATCH] Follow regexp validation as per tc39/ecma262#1869 --- acorn/src/regexp.js | 73 ++++++++++++++++++++------------------- bin/test262.whitelist | 10 ++++++ test/run.js | 1 + test/tests-regexp-2020.js | 21 +++++++++++ 4 files changed, 70 insertions(+), 35 deletions(-) create mode 100644 test/tests-regexp-2020.js diff --git a/acorn/src/regexp.js b/acorn/src/regexp.js index 605bce520..d0e66df83 100644 --- a/acorn/src/regexp.js +++ b/acorn/src/regexp.js @@ -40,49 +40,49 @@ export class RegExpValidationState { // If u flag is given, this returns the code point at the index (it combines a surrogate pair). // Otherwise, this returns the code unit of the index (can be a part of a surrogate pair). - at(i) { + at(i, forceU = false) { const s = this.source const l = s.length if (i >= l) { return -1 } const c = s.charCodeAt(i) - if (!this.switchU || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l) { + if (!(forceU || this.switchU) || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l) { return c } const next = s.charCodeAt(i + 1) return next >= 0xDC00 && next <= 0xDFFF ? (c << 10) + next - 0x35FDC00 : c } - nextIndex(i) { + nextIndex(i, forceU = false) { const s = this.source const l = s.length if (i >= l) { return l } let c = s.charCodeAt(i), next - if (!this.switchU || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l || + if (!(forceU || this.switchU) || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l || (next = s.charCodeAt(i + 1)) < 0xDC00 || next > 0xDFFF) { return i + 1 } return i + 2 } - current() { - return this.at(this.pos) + current(forceU = false) { + return this.at(this.pos, forceU) } - lookahead() { - return this.at(this.nextIndex(this.pos)) + lookahead(forceU = false) { + return this.at(this.nextIndex(this.pos, forceU), forceU) } - advance() { - this.pos = this.nextIndex(this.pos) + advance(forceU = false) { + this.pos = this.nextIndex(this.pos, forceU) } - eat(ch) { - if (this.current() === ch) { - this.advance() + eat(ch, forceU = false) { + if (this.current(forceU) === ch) { + this.advance(forceU) return true } return false @@ -418,9 +418,9 @@ pp.regexp_eatExtendedPatternCharacter = function(state) { return false } -// GroupSpecifier[U] :: +// GroupSpecifier :: // [empty] -// `?` GroupName[?U] +// `?` GroupName pp.regexp_groupSpecifier = function(state) { if (state.eat(0x3F /* ? */)) { if (this.regexp_eatGroupName(state)) { @@ -434,8 +434,8 @@ pp.regexp_groupSpecifier = function(state) { } } -// GroupName[U] :: -// `<` RegExpIdentifierName[?U] `>` +// GroupName :: +// `<` RegExpIdentifierName `>` // Note: this updates `state.lastStringValue` property with the eaten name. pp.regexp_eatGroupName = function(state) { state.lastStringValue = "" @@ -448,9 +448,9 @@ pp.regexp_eatGroupName = function(state) { return false } -// RegExpIdentifierName[U] :: -// RegExpIdentifierStart[?U] -// RegExpIdentifierName[?U] RegExpIdentifierPart[?U] +// RegExpIdentifierName :: +// RegExpIdentifierStart +// RegExpIdentifierName RegExpIdentifierPart // Note: this updates `state.lastStringValue` property with the eaten name. pp.regexp_eatRegExpIdentifierName = function(state) { state.lastStringValue = "" @@ -464,17 +464,18 @@ pp.regexp_eatRegExpIdentifierName = function(state) { return false } -// RegExpIdentifierStart[U] :: +// RegExpIdentifierStart :: // UnicodeIDStart // `$` // `_` -// `\` RegExpUnicodeEscapeSequence[?U] +// `\` RegExpUnicodeEscapeSequence[+U] pp.regexp_eatRegExpIdentifierStart = function(state) { const start = state.pos - let ch = state.current() - state.advance() + const forceU = this.options.ecmaVersion >= 11 + let ch = state.current(forceU) + state.advance(forceU) - if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state)) { + if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state, forceU)) { ch = state.lastIntValue } if (isRegExpIdentifierStart(ch)) { @@ -489,19 +490,20 @@ function isRegExpIdentifierStart(ch) { return isIdentifierStart(ch, true) || ch === 0x24 /* $ */ || ch === 0x5F /* _ */ } -// RegExpIdentifierPart[U] :: +// RegExpIdentifierPart :: // UnicodeIDContinue // `$` // `_` -// `\` RegExpUnicodeEscapeSequence[?U] +// `\` RegExpUnicodeEscapeSequence[+U] // // pp.regexp_eatRegExpIdentifierPart = function(state) { const start = state.pos - let ch = state.current() - state.advance() + const forceU = this.options.ecmaVersion >= 11 + let ch = state.current(forceU) + state.advance(forceU) - if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state)) { + if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state, forceU)) { ch = state.lastIntValue } if (isRegExpIdentifierPart(ch)) { @@ -571,7 +573,7 @@ pp.regexp_eatCharacterEscape = function(state) { this.regexp_eatCControlLetter(state) || this.regexp_eatZero(state) || this.regexp_eatHexEscapeSequence(state) || - this.regexp_eatRegExpUnicodeEscapeSequence(state) || + this.regexp_eatRegExpUnicodeEscapeSequence(state, false) || (!state.switchU && this.regexp_eatLegacyOctalEscapeSequence(state)) || this.regexp_eatIdentityEscape(state) ) @@ -644,13 +646,14 @@ function isControlLetter(ch) { } // https://www.ecma-international.org/ecma-262/8.0/#prod-RegExpUnicodeEscapeSequence -pp.regexp_eatRegExpUnicodeEscapeSequence = function(state) { +pp.regexp_eatRegExpUnicodeEscapeSequence = function(state, forceU = false) { const start = state.pos + const switchU = forceU || state.switchU if (state.eat(0x75 /* u */)) { if (this.regexp_eatFixedHexDigits(state, 4)) { const lead = state.lastIntValue - if (state.switchU && lead >= 0xD800 && lead <= 0xDBFF) { + if (switchU && lead >= 0xD800 && lead <= 0xDBFF) { const leadSurrogateEnd = state.pos if (state.eat(0x5C /* \ */) && state.eat(0x75 /* u */) && this.regexp_eatFixedHexDigits(state, 4)) { const trail = state.lastIntValue @@ -665,7 +668,7 @@ pp.regexp_eatRegExpUnicodeEscapeSequence = function(state) { return true } if ( - state.switchU && + switchU && state.eat(0x7B /* { */) && this.regexp_eatHexDigits(state) && state.eat(0x7D /* } */) && @@ -673,7 +676,7 @@ pp.regexp_eatRegExpUnicodeEscapeSequence = function(state) { ) { return true } - if (state.switchU) { + if (switchU) { state.raise("Invalid unicode escape") } state.pos = start diff --git a/bin/test262.whitelist b/bin/test262.whitelist index e69de29bb..68ddd0f1d 100644 --- a/bin/test262.whitelist +++ b/bin/test262.whitelist @@ -0,0 +1,10 @@ +language/literals/regexp/named-groups/invalid-non-id-continue-groupspecifier.js (default) +language/literals/regexp/named-groups/invalid-non-id-continue-groupspecifier.js (strict mode) +language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-3.js (default) +language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-3.js (strict mode) +language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-6.js (default) +language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-6.js (strict mode) +language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier.js (default) +language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier.js (strict mode) +language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier-2.js (default) +language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier-2.js (strict mode) diff --git a/test/run.js b/test/run.js index b34720f7e..929360099 100644 --- a/test/run.js +++ b/test/run.js @@ -12,6 +12,7 @@ require("./tests-async-iteration.js"); require("./tests-regexp.js"); require("./tests-regexp-2018.js"); + require("./tests-regexp-2020.js"); require("./tests-json-superset.js"); require("./tests-optional-catch-binding.js"); require("./tests-bigint.js"); diff --git a/test/tests-regexp-2020.js b/test/tests-regexp-2020.js new file mode 100644 index 000000000..7caff8f2a --- /dev/null +++ b/test/tests-regexp-2020.js @@ -0,0 +1,21 @@ +if (typeof exports != "undefined") { + var test = require("./driver.js").test + var testFail = require("./driver.js").testFail +} + +// https://github.com/tc39/ecma262/pull/1869 +testFail("/(?<\\ud835\\udc9c>.)/", "Invalid regular expression: /(?<\\ud835\\udc9c>.)/: Invalid capture group name (1:1)", { ecmaVersion: 2019 }) +test("/(?<\\ud835\\udc9c>.)/", {}, { ecmaVersion: 2020 }) +test("/(?<\\ud835\\udc9c>.)/u", {}, { ecmaVersion: 2019 }) +test("/(?<\\ud835\\udc9c>.)/u", {}, { ecmaVersion: 2020 }) + +testFail("/(?<\\u{1d49c}>.)/", "Invalid regular expression: /(?<\\u{1d49c}>.)/: Invalid capture group name (1:1)", { ecmaVersion: 2019 }) +test("/(?<\\u{1d49c}>.)/", {}, { ecmaVersion: 2020 }) +test("/(?<\\u{1d49c}>.)/u", {}, { ecmaVersion: 2019 }) +test("/(?<\\u{1d49c}>.)/u", {}, { ecmaVersion: 2020 }) + +testFail("/(?<𝒜>.)/", "Invalid regular expression: /(?<𝒜>.)/: Invalid capture group name (1:1)", { ecmaVersion: 2019 }) +test("/(?<𝒜>.)/", {}, { ecmaVersion: 2020 }) +test("/(?<𝒜>.)/u", {}, { ecmaVersion: 2019 }) +test("/(?<𝒜>.)/u", {}, { ecmaVersion: 2020 }) +