From 9d041c4c950b687df86f87842f458ad72bd7b482 Mon Sep 17 00:00:00 2001 From: Marijn Haverbeke Date: Tue, 16 Apr 2024 13:57:24 +0200 Subject: [PATCH] Allow duplicate regexp capture group names in different branches FEATURE: Support ES2025 duplicate capture group names in regular expressions. Issue https://github.com/acornjs/acorn/issues/1290 Issue https://github.com/acornjs/acorn/pull/1291 --- .eslintrc.js | 2 + acorn/src/regexp.js | 68 ++++++++++++++++++++++++++------ bin/test262.unsupported-features | 1 - test/run.js | 1 + test/tests-regexp-2025.js | 18 +++++++++ 5 files changed, 78 insertions(+), 12 deletions(-) create mode 100644 test/tests-regexp-2025.js diff --git a/.eslintrc.js b/.eslintrc.js index 31534c60a..e9ab078c5 100644 --- a/.eslintrc.js +++ b/.eslintrc.js @@ -21,6 +21,8 @@ module.exports = { ], plugins: ["eslint-plugin-import"], rules: { + "no-unreachable-loop": "off", + "no-empty": "off", curly: "off", eqeqeq: ["error", "always", {null: "ignore"}], indent: [ diff --git a/acorn/src/regexp.js b/acorn/src/regexp.js index 71babf7ae..1f6373287 100644 --- a/acorn/src/regexp.js +++ b/acorn/src/regexp.js @@ -5,6 +5,32 @@ import {hasOwn, codePointToString} from "./util.js" const pp = Parser.prototype +// Track disjunction structure to determine whether a duplicate +// capture group name is allowed because it is in a separate branch. 
+class BranchID { + constructor(parent, base) { + // Parent disjunction branch + this.parent = parent + // Identifies this set of sibling branches + this.base = base || this + } + + separatedFrom(alt) { + // A branch is separate from another branch if they or any of + // their parents are siblings in a given disjunction + for (let self = this; self; self = self.parent) { + for (let other = alt; other; other = other.parent) { + if (self.base === other.base && self !== other) return true + } + } + return false + } + + sibling() { + return new BranchID(this.parent, this.base) + } +} + export class RegExpValidationState { constructor(parser) { this.parser = parser @@ -22,8 +48,9 @@ export class RegExpValidationState { this.lastAssertionIsQuantifiable = false this.numCapturingParens = 0 this.maxBackReference = 0 - this.groupNames = [] + this.groupNames = Object.create(null) this.backReferenceNames = [] + this.branchID = null /* BranchID of the alternative being parsed; regexp_pattern resets it to null */ } reset(start, pattern, flags) { @@ -140,6 +167,11 @@ pp.validateRegExpFlags = function(state) { } } +function hasProp(obj) { + for (let _ in obj) return true + return false +} + /** * Validate the pattern part of a given RegExpLiteral. * @@ -154,7 +186,7 @@ pp.validateRegExpPattern = function(state) { // |Pattern[~U, +N]| and use this result instead. Throw a *SyntaxError* // exception if _P_ did not conform to the grammar, if any elements of _P_ // were not matched by the parse, or if any Early Error conditions exist. 
- if (!state.switchN && this.options.ecmaVersion >= 9 && state.groupNames.length > 0) { + if (!state.switchN && this.options.ecmaVersion >= 9 && hasProp(state.groupNames)) { state.switchN = true this.regexp_pattern(state) } @@ -168,8 +200,9 @@ pp.regexp_pattern = function(state) { state.lastAssertionIsQuantifiable = false state.numCapturingParens = 0 state.maxBackReference = 0 - state.groupNames.length = 0 + state.groupNames = Object.create(null) state.backReferenceNames.length = 0 + state.branchID = null this.regexp_disjunction(state) @@ -186,7 +219,7 @@ pp.regexp_pattern = function(state) { state.raise("Invalid escape") } for (const name of state.backReferenceNames) { - if (state.groupNames.indexOf(name) === -1) { + if (!state.groupNames[name]) { state.raise("Invalid named capture referenced") } } @@ -194,10 +227,14 @@ pp.regexp_pattern = function(state) { // https://www.ecma-international.org/ecma-262/8.0/#prod-Disjunction pp.regexp_disjunction = function(state) { + let trackDisjunction = this.options.ecmaVersion >= 16 + if (trackDisjunction) state.branchID = new BranchID(state.branchID, null) /* null base: the new branch becomes its own base, i.e. a new set of siblings */ this.regexp_alternative(state) while (state.eat(0x7C /* | */)) { + if (trackDisjunction) state.branchID = state.branchID.sibling() this.regexp_alternative(state) } + if (trackDisjunction) state.branchID = state.branchID.parent // Make the same message as V8. if (this.regexp_eatQuantifier(state, true)) { @@ -210,8 +247,7 @@ pp.regexp_disjunction = function(state) { // https://www.ecma-international.org/ecma-262/8.0/#prod-Alternative pp.regexp_alternative = function(state) { - while (state.pos < state.source.length && this.regexp_eatTerm(state)) - ; + while (state.pos < state.source.length && this.regexp_eatTerm(state)) {} } // https://www.ecma-international.org/ecma-262/8.0/#prod-annexB-Term @@ -447,14 +483,24 @@ pp.regexp_eatExtendedPatternCharacter = function(state) { // `?` GroupName pp.regexp_groupSpecifier = function(state) { if (state.eat(0x3F /* ? 
*/)) { - if (this.regexp_eatGroupName(state)) { - if (state.groupNames.indexOf(state.lastStringValue) !== -1) { + if (!this.regexp_eatGroupName(state)) state.raise("Invalid group") + let trackDisjunction = this.options.ecmaVersion >= 16 + let known = state.groupNames[state.lastStringValue] + if (known) { + if (trackDisjunction) { + for (let altID of known) { + if (!altID.separatedFrom(state.branchID)) + state.raise("Duplicate capture group name") + } + } else { state.raise("Duplicate capture group name") } - state.groupNames.push(state.lastStringValue) - return } - state.raise("Invalid group") + if (trackDisjunction) { + (known || (state.groupNames[state.lastStringValue] = [])).push(state.branchID) + } else { + state.groupNames[state.lastStringValue] = true + } } } diff --git a/bin/test262.unsupported-features b/bin/test262.unsupported-features index 5ab02064c..383077264 100644 --- a/bin/test262.unsupported-features +++ b/bin/test262.unsupported-features @@ -1,3 +1,2 @@ decorators import-assertions -regexp-duplicate-named-groups diff --git a/test/run.js b/test/run.js index 05087c483..0587bc571 100644 --- a/test/run.js +++ b/test/run.js @@ -15,6 +15,7 @@ require("./tests-regexp-2020.js"); require("./tests-regexp-2022.js"); require("./tests-regexp-2024.js"); + require("./tests-regexp-2025.js"); require("./tests-json-superset.js"); require("./tests-optional-catch-binding.js"); require("./tests-bigint.js"); diff --git a/test/tests-regexp-2025.js b/test/tests-regexp-2025.js new file mode 100644 index 000000000..3929ceb51 --- /dev/null +++ b/test/tests-regexp-2025.js @@ -0,0 +1,18 @@ +if (typeof exports !== "undefined") { + var test = require("./driver.js").test + var testFail = require("./driver.js").testFail +} + +test("/(?a)|(?b)/", {}, {ecmaVersion: 2025}) +testFail("/(?a)|(?b)/", "Invalid regular expression: /(?a)|(?b)/: Duplicate capture group name (1:1)", {ecmaVersion: 2024 }) +testFail("/(?a)(?b)/", "Invalid regular expression: /(?a)(?b)/: Duplicate capture group 
name (1:1)", {ecmaVersion: 2025}) +test("/(?:(?a)|(?b))\\k/", {}, {ecmaVersion: 2025}) +testFail("/(?:(?a)|(?b))\\k/", "Invalid regular expression: /(?:(?a)|(?b))\\k/: Duplicate capture group name (1:1)", {ecmaVersion: 2024 }) +testFail("/(?:(?a)(?b))\\k/", "Invalid regular expression: /(?:(?a)(?b))\\k/: Duplicate capture group name (1:1)", {ecmaVersion: 2025}) +test("/(?a)(?a)|(?b)(?b)/", {}, {ecmaVersion: 2025}) +test("/(?a)|(?b)|(?c)/", {}, {ecmaVersion: 2025}) +test("/(?a)|\\k/", {}, {ecmaVersion: 2025}) +testFail("/(?a)|(?b)(?c)/", "Invalid regular expression: /(?a)|(?b)(?c)/: Duplicate capture group name (1:1)", {ecmaVersion: 2025}) +testFail("/(?:(?a)|(?b))(?c)/", "Invalid regular expression: /(?:(?a)|(?b))(?c)/: Duplicate capture group name (1:1)", {ecmaVersion: 2025}) +testFail("/(?a)(?:(?b)|(?c))/", "Invalid regular expression: /(?a)(?:(?b)|(?c))/: Duplicate capture group name (1:1)", {ecmaVersion: 2025}) +testFail("/(?:(?:(?a)|(?b))|(?:))(?c)/", "Invalid regular expression: /(?:(?:(?a)|(?b))|(?:))(?c)/: Duplicate capture group name (1:1)", {ecmaVersion: 2025})