diff --git a/README.md b/README.md index ba92f9c..092ba4a 100644 --- a/README.md +++ b/README.md @@ -114,15 +114,38 @@ These options can be set to `false` or `'transform'`. When using `'transform'`, // → '(.)\1' ``` - #### Miscellaneous options diff --git a/demo.js b/demo.js index 6f7b405..c200a24 100644 --- a/demo.js +++ b/demo.js @@ -5,15 +5,11 @@ const parse = require('regjsparser').parse; const generate = require('regjsgen').generate; const regenerate = require('regenerate'); -const pattern = String.raw`\w`; +const pattern = String.raw`[[a-h]&&[f-z]]`; -console.log(generate(parse(pattern))); - -const processedPattern = rewritePattern(pattern, 'ui', { - 'unicodeFlag': 'transform' +const processedPattern = rewritePattern(pattern, 'v', { + 'unicodeSetsFlag': 'transform' }); console.log(processedPattern); -// throws -new RegExp(processedPattern, 'u'); diff --git a/package.json b/package.json index 35055c2..fb25eaf 100644 --- a/package.json +++ b/package.json @@ -50,8 +50,8 @@ "dependencies": { "regenerate": "^1.4.2", "regenerate-unicode-properties": "^9.0.0", - "regjsgen": "^0.5.2", - "regjsparser": "^0.7.0", + "regjsgen": "^0.6.0", + "regjsparser": "^0.8.2", "unicode-match-property-ecmascript": "^2.0.0", "unicode-match-property-value-ecmascript": "^2.0.0" }, diff --git a/rewrite-pattern.js b/rewrite-pattern.js index a647eab..5e1952e 100644 --- a/rewrite-pattern.js +++ b/rewrite-pattern.js @@ -11,9 +11,6 @@ const ESCAPE_SETS = require('./data/character-class-escape-sets.js'); // Prepare a Regenerate set containing all code points, used for negative // character classes (if any). const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF); -// Without the `u` flag, the range stops at 0xFFFF. -// https://mths.be/es6#sec-pattern-semantics -const BMP_SET = regenerate().addRange(0x0, 0xFFFF); // Prepare a Regenerate set containing all code points that are supposed to be // matched by `/./u`. 
https://mths.be/es6#sec-atom @@ -98,6 +95,16 @@ regenerate.prototype.iuAddRange = function(min, max) { } while (++min <= max); return $this; }; +regenerate.prototype.iuRemoveRange = function(min, max) { + const $this = this; + do { + const folded = caseFold(min); + if (folded) { + $this.remove(folded); + } + } while (++min <= max); + return $this; +}; const update = (item, pattern) => { let tree = parse(pattern, config.useUnicodeFlag ? 'u' : ''); @@ -128,42 +135,118 @@ const caseFold = (codePoint) => { return iuMappings.get(codePoint) || false; }; -const processCharacterClass = (characterClassItem, regenerateOptions) => { +const buildHandler = (action) => { + switch (action) { + case 'union': + return { + single: (set, cp) => set ? set.add(cp) : regenerate(cp), + regSet: (set, set2) => set ? set.add(set2) : set2.clone(), + range: (set, start, end) => { + if (!set) set = regenerate(); + set.addRange(start, end); + return set; + }, + iuRange: (set, start, end) => { + if (!set) set = regenerate(); + set.iuAddRange(start, end); + return set; + } + }; + case 'union-negative': + return { + single: (set, cp) => set && set.contains(cp) ? UNICODE_SET.clone() : UNICODE_SET.clone().remove(cp), + regSet: (set, set2) => UNICODE_SET.clone().remove(set2).add(set || []), + range: (set, start, end) => UNICODE_SET.clone().removeRange(start, end).add(set || []), + iuRange: (set, start, end) => UNICODE_SET.clone().iuRemoveRange(start, end).add(set || []) + }; + case 'intersection': + const regSet = (set, set2) => set ? set.intersection(set2) : set2.clone(); + return { + single: (set, cp) => !set || set.contains(cp) ? regenerate(cp) : regenerate(), + regSet: regSet, + range: (set, start, end) => regSet(set, regenerate().addRange(start, end)), + iuRange: (set, start, end) => regSet(set, regenerate().iuAddRange(start, end)) + }; + case 'subtraction': + return { + single: (set, cp) => set ? set.remove(cp) : regenerate(cp), + regSet: (set, set2) => set ? 
set.remove(set2) : set2.clone(), + range: (set, start, end) => set ? set.removeRange(start, end) : regenerate().addRange(start, end), + iuRange: (set, start, end) => set ? set.iuRemoveRange(start, end) : regenerate().iuAddRange(start, end) + }; + // The `default` clause is only here as a safeguard; it should never be + // reached. Code coverage tools should ignore it. + /* istanbul ignore next */ + default: + throw new Error(`Unknown set action: ${ action }`); + } +}; + +const computeCharacterClass = (characterClassItem) => { let transformed = config.transform.unicodeFlag; - const negative = characterClassItem.negative; - const set = regenerate(); + let set; + + let handlePositive; + let handleNegative; + + switch (characterClassItem.kind) { + case 'union': + handlePositive = buildHandler('union'); + handleNegative = buildHandler('union-negative'); + break; + case 'intersection': + handlePositive = buildHandler('intersection'); + handleNegative = buildHandler('subtraction'); + break; + case 'subtraction': + handlePositive = buildHandler('subtraction'); + handleNegative = buildHandler('intersection'); + break; + // The `default` clause is only here as a safeguard; it should never be + // reached. Code coverage tools should ignore it. 
+ /* istanbul ignore next */ + default: + throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`); + } + for (const item of characterClassItem.body) { switch (item.type) { case 'value': - set.add(item.codePoint); + set = handlePositive.single(set, item.codePoint); if (config.flags.ignoreCase && config.transform.unicodeFlag) { const folded = caseFold(item.codePoint); if (folded) { - set.add(folded); + set = handlePositive.single(set, folded); } } break; case 'characterClassRange': const min = item.min.codePoint; const max = item.max.codePoint; - set.addRange(min, max); + set = handlePositive.range(set, min, max); if (config.flags.ignoreCase && config.transform.unicodeFlag) { - set.iuAddRange(min, max); + set = handlePositive.iuRange(set, min, max); } break; case 'characterClassEscape': - set.add(getCharacterClassEscapeSet( + set = handlePositive.regSet(set, getCharacterClassEscapeSet( item.value, config.flags.unicode, config.flags.ignoreCase )); break; case 'unicodePropertyEscape': - set.add(getUnicodePropertyEscapeSet(item.value, item.negative)); + set = handlePositive.regSet(set, getUnicodePropertyEscapeSet(item.value, item.negative)); if (config.transform.unicodePropertyEscapes) { transformed = true; } break; + case 'characterClass': + const handler = item.negative ? handleNegative : handlePositive; + const res = computeCharacterClass(item); + set = handler.regSet(set, res.set); + transformed = true; + break; // The `default` clause is only here as a safeguard; it should never be // reached. Code coverage tools should ignore it. 
/* istanbul ignore next */ @@ -171,11 +254,27 @@ const processCharacterClass = (characterClassItem, regenerateOptions) => { throw new Error(`Unknown term type: ${ item.type }`); } } + + if (!set) { // /[]/ + set = regenerate(); + } + + return { set, transformed }; +}; + +const processCharacterClass = (characterClassItem, regenerateOptions) => { + const negative = characterClassItem.negative; + const { set, transformed } = computeCharacterClass(characterClassItem); if (transformed) { + const setStr = set.toString(regenerateOptions); if (negative) { - update(characterClassItem, `(?!${set.toString(regenerateOptions)})[\\s\\S]`) + if (config.useUnicodeFlag) { + update(characterClassItem, `[^${setStr[0] === '[' ? setStr.slice(1, -1) : setStr}]`); + } else { + update(characterClassItem, `(?!${setStr})[\\s\\S]`); + } } else { - update(characterClassItem, set.toString(regenerateOptions)); + update(characterClassItem, setStr); } } return characterClassItem; @@ -307,27 +406,22 @@ const processTerm = (item, regenerateOptions, groups) => { return item; }; -// Enable every stable RegExp feature by default -const regjsparserFeatures = { - 'unicodePropertyEscape': true, - 'namedGroups': true, - 'lookbehind': true, -}; - const config = { 'flags': { 'ignoreCase': false, 'unicode': false, + 'unicodeSets': false, 'dotAll': false, }, 'transform': { 'dotAllFlag': false, 'unicodeFlag': false, + 'unicodeSetsFlag': false, 'unicodePropertyEscapes': false, 'namedGroups': false, }, get useUnicodeFlag() { - return this.flags.unicode && !this.transform.unicodeFlag; + return (this.flags.unicode || this.flags.unicodeSets) && !this.transform.unicodeFlag; } }; @@ -345,6 +439,11 @@ const validateOptions = (options) => { throw new Error(`.${key} must be false (default) or 'transform'.`); } break; + case 'unicodeSetsFlag': + if (value != null && value !== false && value !== 'parse' && value !== 'transform') { + throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`); + } + break; case 'onNamedGroup': if (value 
!= null && typeof value !== 'function') { throw new Error('.onNamedGroup must be a function.'); @@ -363,21 +462,34 @@ const rewritePattern = (pattern, flags, options) => { validateOptions(options); config.flags.unicode = hasFlag(flags, 'u'); + config.flags.unicodeSets = hasFlag(flags, 'v'); config.flags.ignoreCase = hasFlag(flags, 'i'); config.flags.dotAll = hasFlag(flags, 's'); config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag'); - config.transform.unicodeFlag = config.flags.unicode && transform(options, 'unicodeFlag'); + config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag'); + config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag'); + // unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform' config.transform.unicodePropertyEscapes = config.flags.unicode && ( transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes') ); config.transform.namedGroups = transform(options, 'namedGroups'); + const regjsparserFeatures = { + 'unicodeSet': Boolean(options && options.unicodeSetsFlag), + + // Enable every stable RegExp feature by default + 'unicodePropertyEscape': true, + 'namedGroups': true, + 'lookbehind': true, + }; + const regenerateOptions = { - 'hasUnicodeFlag': config.flags.unicode && !config.transform.unicodeFlag, + 'hasUnicodeFlag': config.useUnicodeFlag, 'bmpOnly': !config.flags.unicode }; + const groups = { 'onNamedGroup': options && options.onNamedGroup, 'lastIndex': 0, diff --git a/tests/tests.js b/tests/tests.js index b5a66eb..082131c 100644 --- a/tests/tests.js +++ b/tests/tests.js @@ -1001,3 +1001,134 @@ describe('character classes', () => { } }); +const TRANSFORM_U = { unicodeFlag: 'transform', unicodeSetsFlag: 'transform' }; + +const unicodeSetFixtures = [ + { + pattern: '[[a-h]&&[f-z]]', + expected: '[f-h]' + }, + { + pattern: '[[a-h]&&[f-z]&&[p-z]]', + expected: '[]' + }, + { 
+ pattern: '[[a-h]&&[b]]', + expected: 'b' + }, + { + pattern: '[[a-h]&&b]', + expected: 'b' + }, + { + pattern: '[[g-z]&&b]', + expected: '[]' + }, + { + pattern: '[[a-h]&&[^f-z]]', + expected: '[a-e]' + }, + { + pattern: '[[a-h]&&[^f-z]&&[p-z]]', + expected: '[]' + }, + { + pattern: '[[a-h]&&[^f-z]&&[^p-z]]', + expected: '[a-e]' + }, + { + pattern: '[[a-h]&&[^b]]', + expected: '[ac-h]' + }, + { + pattern: '[[a-h]--[f-z]]', + expected: '[a-e]' + }, + { + pattern: '[[a-h]--[f-z]--[p-z]]', + expected: '[a-e]' + }, + { + pattern: '[[a-z]--[d-k]--[s-w]]', + expected: '[a-cl-rx-z]' + }, + { + pattern: '[[a-h]--[b]]', + expected: '[ac-h]' + }, + { + pattern: '[[b]--[a-h]]', + expected: '[]' + }, + { + pattern: '[[a-h]--b]', + expected: '[ac-h]' + }, + { + pattern: '[b--[a-h]]', + expected: '[]' + }, + { + pattern: '[[g-z]--b]', + expected: '[g-z]' + }, + { + pattern: '[b--[g-z]]', + expected: 'b' + }, + { + pattern: '[[a-h]--[^f-z]]', + expected: '[f-h]' + }, + { + pattern: '[[a-h]--[^f-z]--[p-z]]', + expected: '[f-h]' + }, + { + pattern: '[[a-h]--[^f-z]--[^p-z]]', + expected: '[]' + }, + { + pattern: '[[a-h]--[^b]]', + expected: 'b' + }, + { + pattern: '[[a-z][f-h]]', + expected: '[a-z]' + }, + { + pattern: '[^[a-z][f-h]]', + expected: '[^a-z]' + }, + { + pattern: '[^[a-z][f-h]]', + expected: '(?:(?![a-z])[\\s\\S])', + options: TRANSFORM_U + }, + { + pattern: '[[^a-z][f-h]]', + expected: '[\\0-`f-h\\{-\\u{10FFFF}]' + }, + { + pattern: '[[^a-z][f-h]]', + expected: '(?:[\\0-`f-h\\{-\\uFFFF]|[\\uD800-\\uDBFF][\\uDC00-\\uDFFF])', + options: TRANSFORM_U + }, +]; + +describe('unicodeSets (v) flag', () => { + for (const fixture of unicodeSetFixtures) { + const pattern = fixture.pattern; + const flags = fixture.flags || 'v'; + const options = fixture.options || { unicodeSetsFlag: 'transform' }; + const transformUnicodeFlag = options.unicodeFlag === 'transform'; + it('rewrites `/' + pattern + '/' + flags + '` correctly ' + (transformUnicodeFlag ? 
'without ' : '') + 'using the u flag', () => { + const transpiled = rewritePattern(pattern, flags, options); + const expected = fixture.expected; + if (transpiled !== '(?:' + expected + ')') { + assert.strictEqual(transpiled, expected); + } + }); + } +}); +