From f1c3592ec66a05835cce2cd8a8fa68ab000e7fa4 Mon Sep 17 00:00:00 2001
From: Toru Nagashima <public@mysticatea.dev>
Date: Thu, 2 Apr 2020 19:57:21 +0900
Subject: [PATCH] Follow regexp validation as per tc39/ecma262#1869

---
 acorn/src/regexp.js       | 73 ++++++++++++++++++++-------------------
 bin/test262.whitelist     | 10 ++++++
 test/run.js               |  1 +
 test/tests-regexp-2020.js | 21 +++++++++++
 4 files changed, 70 insertions(+), 35 deletions(-)
 create mode 100644 test/tests-regexp-2020.js

diff --git a/acorn/src/regexp.js b/acorn/src/regexp.js
index 605bce520..d0e66df83 100644
--- a/acorn/src/regexp.js
+++ b/acorn/src/regexp.js
@@ -40,49 +40,49 @@ export class RegExpValidationState {
 
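-  // If u flag is given, this returns the code point at the index (it combines a surrogate pair).
+  // If u flag is given or `forceU` is true, this returns the code point at the index (it combines a surrogate pair).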
   // Otherwise, this returns the code unit of the index (can be a part of a surrogate pair).
-  at(i) {
+  at(i, forceU = false) {
     const s = this.source
     const l = s.length
     if (i >= l) {
       return -1
     }
     const c = s.charCodeAt(i)
-    if (!this.switchU || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l) {
+    if (!(forceU || this.switchU) || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l) {
       return c
     }
     const next = s.charCodeAt(i + 1)
     return next >= 0xDC00 && next <= 0xDFFF ? (c << 10) + next - 0x35FDC00 : c
   }
 
-  nextIndex(i) {
+  nextIndex(i, forceU = false) {
     const s = this.source
     const l = s.length
     if (i >= l) {
       return l
     }
     let c = s.charCodeAt(i), next
-    if (!this.switchU || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l ||
+    if (!(forceU || this.switchU) || c <= 0xD7FF || c >= 0xE000 || i + 1 >= l ||
         (next = s.charCodeAt(i + 1)) < 0xDC00 || next > 0xDFFF) {
       return i + 1
     }
     return i + 2
   }
 
-  current() {
-    return this.at(this.pos)
+  current(forceU = false) {
+    return this.at(this.pos, forceU)
   }
 
-  lookahead() {
-    return this.at(this.nextIndex(this.pos))
+  lookahead(forceU = false) {
+    return this.at(this.nextIndex(this.pos, forceU), forceU)
   }
 
-  advance() {
-    this.pos = this.nextIndex(this.pos)
+  advance(forceU = false) {
+    this.pos = this.nextIndex(this.pos, forceU)
   }
 
-  eat(ch) {
-    if (this.current() === ch) {
-      this.advance()
+  eat(ch, forceU = false) {
+    if (this.current(forceU) === ch) {
+      this.advance(forceU)
       return true
     }
     return false
@@ -418,9 +418,9 @@ pp.regexp_eatExtendedPatternCharacter = function(state) {
   return false
 }
 
-// GroupSpecifier[U] ::
+// GroupSpecifier ::
 //   [empty]
-//   `?` GroupName[?U]
+//   `?` GroupName
 pp.regexp_groupSpecifier = function(state) {
   if (state.eat(0x3F /* ? */)) {
     if (this.regexp_eatGroupName(state)) {
@@ -434,8 +434,8 @@ pp.regexp_groupSpecifier = function(state) {
   }
 }
 
-// GroupName[U] ::
-//   `<` RegExpIdentifierName[?U] `>`
+// GroupName ::
+//   `<` RegExpIdentifierName `>`
 // Note: this updates `state.lastStringValue` property with the eaten name.
 pp.regexp_eatGroupName = function(state) {
   state.lastStringValue = ""
@@ -448,9 +448,9 @@ pp.regexp_eatGroupName = function(state) {
   return false
 }
 
-// RegExpIdentifierName[U] ::
-//   RegExpIdentifierStart[?U]
-//   RegExpIdentifierName[?U] RegExpIdentifierPart[?U]
+// RegExpIdentifierName ::
+//   RegExpIdentifierStart
+//   RegExpIdentifierName RegExpIdentifierPart
 // Note: this updates `state.lastStringValue` property with the eaten name.
 pp.regexp_eatRegExpIdentifierName = function(state) {
   state.lastStringValue = ""
@@ -464,17 +464,18 @@ pp.regexp_eatRegExpIdentifierName = function(state) {
   return false
 }
 
-// RegExpIdentifierStart[U] ::
+// RegExpIdentifierStart ::
 //   UnicodeIDStart
 //   `$`
 //   `_`
-//   `\` RegExpUnicodeEscapeSequence[?U]
+//   `\` RegExpUnicodeEscapeSequence[+U]
 pp.regexp_eatRegExpIdentifierStart = function(state) {
   const start = state.pos
-  let ch = state.current()
-  state.advance()
+  const forceU = this.options.ecmaVersion >= 11
+  let ch = state.current(forceU)
+  state.advance(forceU)
 
-  if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state)) {
+  if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state, forceU)) {
     ch = state.lastIntValue
   }
   if (isRegExpIdentifierStart(ch)) {
@@ -489,19 +490,20 @@ function isRegExpIdentifierStart(ch) {
   return isIdentifierStart(ch, true) || ch === 0x24 /* $ */ || ch === 0x5F /* _ */
 }
 
-// RegExpIdentifierPart[U] ::
+// RegExpIdentifierPart ::
 //   UnicodeIDContinue
 //   `$`
 //   `_`
-//   `\` RegExpUnicodeEscapeSequence[?U]
+//   `\` RegExpUnicodeEscapeSequence[+U]
 //   <ZWNJ>
 //   <ZWJ>
 pp.regexp_eatRegExpIdentifierPart = function(state) {
   const start = state.pos
-  let ch = state.current()
-  state.advance()
+  const forceU = this.options.ecmaVersion >= 11
+  let ch = state.current(forceU)
+  state.advance(forceU)
 
-  if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state)) {
+  if (ch === 0x5C /* \ */ && this.regexp_eatRegExpUnicodeEscapeSequence(state, forceU)) {
     ch = state.lastIntValue
   }
   if (isRegExpIdentifierPart(ch)) {
@@ -571,7 +573,7 @@ pp.regexp_eatCharacterEscape = function(state) {
     this.regexp_eatCControlLetter(state) ||
     this.regexp_eatZero(state) ||
     this.regexp_eatHexEscapeSequence(state) ||
-    this.regexp_eatRegExpUnicodeEscapeSequence(state) ||
+    this.regexp_eatRegExpUnicodeEscapeSequence(state, false) ||
     (!state.switchU && this.regexp_eatLegacyOctalEscapeSequence(state)) ||
     this.regexp_eatIdentityEscape(state)
   )
@@ -644,13 +646,14 @@ function isControlLetter(ch) {
 }
 
 // https://www.ecma-international.org/ecma-262/8.0/#prod-RegExpUnicodeEscapeSequence
-pp.regexp_eatRegExpUnicodeEscapeSequence = function(state) {
+pp.regexp_eatRegExpUnicodeEscapeSequence = function(state, forceU = false) {
   const start = state.pos
+  const switchU = forceU || state.switchU
 
   if (state.eat(0x75 /* u */)) {
     if (this.regexp_eatFixedHexDigits(state, 4)) {
       const lead = state.lastIntValue
-      if (state.switchU && lead >= 0xD800 && lead <= 0xDBFF) {
+      if (switchU && lead >= 0xD800 && lead <= 0xDBFF) {
         const leadSurrogateEnd = state.pos
         if (state.eat(0x5C /* \ */) && state.eat(0x75 /* u */) && this.regexp_eatFixedHexDigits(state, 4)) {
           const trail = state.lastIntValue
@@ -665,7 +668,7 @@ pp.regexp_eatRegExpUnicodeEscapeSequence = function(state) {
       return true
     }
     if (
-      state.switchU &&
+      switchU &&
       state.eat(0x7B /* { */) &&
       this.regexp_eatHexDigits(state) &&
       state.eat(0x7D /* } */) &&
@@ -673,7 +676,7 @@ pp.regexp_eatRegExpUnicodeEscapeSequence = function(state) {
     ) {
       return true
     }
-    if (state.switchU) {
+    if (switchU) {
       state.raise("Invalid unicode escape")
     }
     state.pos = start
diff --git a/bin/test262.whitelist b/bin/test262.whitelist
index e69de29bb..68ddd0f1d 100644
--- a/bin/test262.whitelist
+++ b/bin/test262.whitelist
@@ -0,0 +1,10 @@
+language/literals/regexp/named-groups/invalid-non-id-continue-groupspecifier.js (default)
+language/literals/regexp/named-groups/invalid-non-id-continue-groupspecifier.js (strict mode)
+language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-3.js (default)
+language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-3.js (strict mode)
+language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-6.js (default)
+language/literals/regexp/named-groups/invalid-non-id-start-groupspecifier-6.js (strict mode)
+language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier.js (default)
+language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier.js (strict mode)
+language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier-2.js (default)
+language/literals/regexp/named-groups/invalid-u-escape-in-groupspecifier-2.js (strict mode)
diff --git a/test/run.js b/test/run.js
index b34720f7e..929360099 100644
--- a/test/run.js
+++ b/test/run.js
@@ -12,6 +12,7 @@
   require("./tests-async-iteration.js");
   require("./tests-regexp.js");
   require("./tests-regexp-2018.js");
+  require("./tests-regexp-2020.js");
   require("./tests-json-superset.js");
   require("./tests-optional-catch-binding.js");
   require("./tests-bigint.js");
diff --git a/test/tests-regexp-2020.js b/test/tests-regexp-2020.js
new file mode 100644
index 000000000..7caff8f2a
--- /dev/null
+++ b/test/tests-regexp-2020.js
@@ -0,0 +1,21 @@
+if (typeof exports != "undefined") {
+  var test = require("./driver.js").test
+  var testFail = require("./driver.js").testFail
+}
+
+// https://github.com/tc39/ecma262/pull/1869
+testFail("/(?<\\ud835\\udc9c>.)/", "Invalid regular expression: /(?<\\ud835\\udc9c>.)/: Invalid capture group name (1:1)", { ecmaVersion: 2019 })
+test("/(?<\\ud835\\udc9c>.)/", {}, { ecmaVersion: 2020 })
+test("/(?<\\ud835\\udc9c>.)/u", {}, { ecmaVersion: 2019 })
+test("/(?<\\ud835\\udc9c>.)/u", {}, { ecmaVersion: 2020 })
+
+testFail("/(?<\\u{1d49c}>.)/", "Invalid regular expression: /(?<\\u{1d49c}>.)/: Invalid capture group name (1:1)", { ecmaVersion: 2019 })
+test("/(?<\\u{1d49c}>.)/", {}, { ecmaVersion: 2020 })
+test("/(?<\\u{1d49c}>.)/u", {}, { ecmaVersion: 2019 })
+test("/(?<\\u{1d49c}>.)/u", {}, { ecmaVersion: 2020 })
+
+testFail("/(?<𝒜>.)/", "Invalid regular expression: /(?<𝒜>.)/: Invalid capture group name (1:1)", { ecmaVersion: 2019 })
+test("/(?<𝒜>.)/", {}, { ecmaVersion: 2020 })
+test("/(?<𝒜>.)/u", {}, { ecmaVersion: 2019 })
+test("/(?<𝒜>.)/u", {}, { ecmaVersion: 2020 })
+