Skip to content

Commit

Permalink
Implement support for && and || in v sets
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolo-ribaudo committed Sep 9, 2021
1 parent 999bfaf commit ac2e643
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 25 deletions.
10 changes: 3 additions & 7 deletions demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@ const parse = require('regjsparser').parse;
const generate = require('regjsgen').generate;
const regenerate = require('regenerate');

const pattern = String.raw`\w`;
const pattern = String.raw`[[a-h]&&[f-z]]`;

console.log(generate(parse(pattern)));

const processedPattern = rewritePattern(pattern, 'ui', {
'unicodeFlag': 'transform'
const processedPattern = rewritePattern(pattern, 'v', {
'unicodeSetFlag': 'transform'
});

console.log(processedPattern);

// throws
new RegExp(processedPattern, 'u');
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"regenerate": "^1.4.2",
"regenerate-unicode-properties": "^8.2.0",
"regjsgen": "^0.5.2",
"regjsparser": "^0.6.4",
"regjsparser": "^0.7.0",
"unicode-match-property-ecmascript": "^1.0.4",
"unicode-match-property-value-ecmascript": "^1.2.0"
},
Expand Down
141 changes: 124 additions & 17 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
// Prepare a Regenerate set containing all code points, used for negative
// character classes (if any).
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. https://mths.be/es6#sec-atom
Expand Down Expand Up @@ -98,6 +95,16 @@ regenerate.prototype.iuAddRange = function(min, max) {
} while (++min <= max);
return $this;
};
// Walks the code points from `min` up to `max` and, for each one that
// `caseFold` maps to a counterpart, removes that counterpart from this set.
// The code points of the range itself are not removed here. `min` is always
// visited, even when it is greater than `max`. Returns the set for chaining.
regenerate.prototype.iuRemoveRange = function(min, max) {
	let codePoint = min;
	do {
		const counterpart = caseFold(codePoint);
		// `caseFold` yields a falsy value when there is no mapping.
		if (counterpart) {
			this.remove(counterpart);
		}
	} while (++codePoint <= max);
	return this;
};

const update = (item, pattern) => {
let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '');
Expand Down Expand Up @@ -128,54 +135,146 @@ const caseFold = (codePoint) => {
return iuMappings.get(codePoint) || false;
};

const processCharacterClass = (characterClassItem, regenerateOptions) => {
/**
 * Builds the callbacks used to fold the operands of a character class into a
 * single Regenerate set, for one kind of set operation.
 *
 * Every callback receives the accumulator built so far (`undefined` before the
 * first operand) followed by the operand, and returns the accumulator to use
 * from then on. An existing accumulator is updated in place.
 *
 * @param {string} action One of 'union', 'union-negative', 'intersection' or
 *   'subtraction'.
 * @returns {{single: Function, regSet: Function, range: Function, iuRange: Function}}
 *   `single` takes a code point, `regSet` a Regenerate set, `range` an
 *   inclusive code point range, and `iuRange` a range whose case-folded
 *   counterparts are applied.
 * @throws {Error} If `action` is not one of the known actions.
 */
const buildHandler = (action) => {
	switch (action) {
		case 'union':
			return {
				single: (set, cp) => set ? set.add(cp) : regenerate(cp),
				// The operand becomes the accumulator when it is the first one. Clone it,
				// because the accumulator is mutated by later operands and the operand
				// may still be owned by whoever produced it.
				regSet: (set, set2) => set ? set.add(set2) : set2.clone(),
				range: (set, start, end) => {
					if (!set) set = regenerate();
					set.addRange(start, end);
					return set;
				},
				iuRange: (set, start, end) => {
					if (!set) set = regenerate();
					set.iuAddRange(start, end);
					return set;
				}
			};
		case 'union-negative':
			// Union of the accumulator with the complement of the operand.
			return {
				// Everything when `cp` is already in the accumulator, otherwise everything
				// except `cp`.
				single: (set, cp) => set && set.contains(cp) ? UNICODE_SET.clone() : UNICODE_SET.clone().remove(cp),
				regSet: (set, set2) => UNICODE_SET.clone().remove(set2).add(set || []),
				range: (set, start, end) => UNICODE_SET.clone().removeRange(start, end).add(set || []),
				iuRange: (set, start, end) => UNICODE_SET.clone().iuRemoveRange(start, end).add(set || [])
			};
		case 'intersection': {
			// Only used with freshly created operands, so no defensive copy is needed.
			const intersectFresh = (set, fresh) => set ? set.intersection(fresh) : fresh;
			return {
				single: (set, cp) => !set || set.contains(cp) ? regenerate(cp) : regenerate(),
				// Clone for the same reason as in the 'union' handler.
				regSet: (set, set2) => set ? set.intersection(set2) : set2.clone(),
				range: (set, start, end) => intersectFresh(set, regenerate().addRange(start, end)),
				iuRange: (set, start, end) => intersectFresh(set, regenerate().iuAddRange(start, end))
			};
		}
		case 'subtraction':
			// The first operand seeds the accumulator; later operands are removed from it.
			return {
				single: (set, cp) => set ? set.remove(cp) : regenerate(cp),
				// Clone for the same reason as in the 'union' handler.
				regSet: (set, set2) => set ? set.remove(set2) : set2.clone(),
				range: (set, start, end) => set ? set.removeRange(start, end) : regenerate().addRange(start, end),
				iuRange: (set, start, end) => set ? set.iuRemoveRange(start, end) : regenerate().iuAddRange(start, end)
			};
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown set action: ${ action }`);
	}
};

/**
 * Computes the set of code points described by the body of a character class
 * node, combining its operands according to the node's `kind` ('union',
 * 'intersection' or 'subtraction').
 *
 * The node's own `negative` flag is not applied here; nested classes are
 * negated while being combined, and the outermost one is left to the caller.
 *
 * @param {Object} characterClassItem Character class node with `kind` and
 *   `body` properties.
 * @returns {{set: Object, transformed: *}} The resulting Regenerate set, and a
 *   truthy `transformed` when the class has to be rewritten.
 * @throws {Error} If the class kind or the type of one of its terms is unknown.
 */
const computeCharacterClass = (characterClassItem) => {
	let transformed = config.transform.unicodeFlag;
	// Stays `undefined` until the first operand has been processed, so that the
	// handlers can tell "no operand yet" apart from "empty set".
	let set;

	let handlePositive;
	let handleNegative;

	switch (characterClassItem.kind) {
		case 'union':
			handlePositive = buildHandler('union');
			handleNegative = buildHandler('union-negative');
			break;
		case 'intersection':
			// A && [^B] is A minus B.
			handlePositive = buildHandler('intersection');
			handleNegative = buildHandler('subtraction');
			break;
		case 'subtraction':
			// A -- [^B] is A intersected with B.
			handlePositive = buildHandler('subtraction');
			handleNegative = buildHandler('intersection');
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
	}

	// Case-folded counterparts are only added by hand when both the `i` flag is
	// set and the `u` flag is being transpiled away.
	const foldCase = config.flags.ignoreCase && config.transform.unicodeFlag;

	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value':
				set = handlePositive.single(set, item.codePoint);
				if (foldCase) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set = handlePositive.single(set, folded);
					}
				}
				break;
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				set = handlePositive.range(set, min, max);
				if (foldCase) {
					set = handlePositive.iuRange(set, min, max);
				}
				break;
			}
			case 'characterClassEscape':
				set = handlePositive.regSet(set, getCharacterClassEscapeSet(
					item.value,
					config.flags.unicode,
					config.flags.ignoreCase
				));
				break;
			case 'unicodePropertyEscape':
				set = handlePositive.regSet(set, getUnicodePropertyEscapeSet(item.value, item.negative));
				if (config.transform.unicodePropertyEscapes) {
					transformed = true;
				}
				break;
			case 'characterClass': {
				const nested = computeCharacterClass(item).set;
				if (item.negative && set === undefined) {
					// A negated class in first position seeds the accumulator with its
					// complement. The intersection and subtraction handlers would seed
					// it with the un-negated set instead.
					set = UNICODE_SET.clone().remove(nested);
				} else {
					const handler = item.negative ? handleNegative : handlePositive;
					set = handler.regSet(set, nested);
				}
				transformed = true;
				break;
			}
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}

	if (!set) { // /[]/
		set = regenerate();
	}

	return { set, transformed };
};

const processCharacterClass = (characterClassItem, regenerateOptions) => {
const negative = characterClassItem.negative;
const { set, transformed } = computeCharacterClass(characterClassItem);
if (transformed) {
const setStr = set.toString(regenerateOptions);
if (negative) {
update(characterClassItem, `(?!${set.toString(regenerateOptions)})[\\s\\S]`)
if (config.useUnicodeFlag) {
update(characterClassItem, `[^${setStr.slice(1, -1)}]`)
} else {
update(characterClassItem, `(?!${setStr})[\\s\\S]`)
}
} else {
update(characterClassItem, set.toString(regenerateOptions));
update(characterClassItem, setStr);
}
}
return characterClassItem;
Expand Down Expand Up @@ -312,22 +411,25 @@ const regjsparserFeatures = {
'unicodePropertyEscape': true,
'namedGroups': true,
'lookbehind': true,
'unicodeSet': true,
};

// Module-level state describing the pattern currently being rewritten;
// `rewritePattern` fills it in before processing.
const config = {
	// Flags the pattern was written for.
	'flags': {
		'ignoreCase': false,
		'unicode': false,
		'unicodeSet': false,
		'dotAll': false,
	},
	// Features that have to be rewritten rather than kept as-is.
	'transform': {
		'dotAllFlag': false,
		'unicodeFlag': false,
		'unicodeSetFlag': false,
		'unicodePropertyEscapes': false,
		'namedGroups': false,
	},
	// Whether the output pattern is still a Unicode-mode pattern: the input used
	// `u` or `v`, and the `u` flag is not being transformed away.
	get useUnicodeFlag() {
		return (this.flags.unicode || this.flags.unicodeSet) && !this.transform.unicodeFlag;
	}
};

Expand All @@ -339,6 +441,7 @@ const validateOptions = (options) => {
switch (key) {
case 'dotAllFlag':
case 'unicodeFlag':
case 'unicodeSetFlag':
case 'unicodePropertyEscapes':
case 'namedGroups':
if (value != null && value !== false && value !== 'transform') {
Expand All @@ -363,21 +466,25 @@ const rewritePattern = (pattern, flags, options) => {
validateOptions(options);

config.flags.unicode = hasFlag(flags, 'u');
config.flags.unicodeSet = hasFlag(flags, 'v');
config.flags.ignoreCase = hasFlag(flags, 'i');
config.flags.dotAll = hasFlag(flags, 's');

config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
config.transform.unicodeFlag = config.flags.unicode && transform(options, 'unicodeFlag');
config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSet) && transform(options, 'unicodeFlag');
config.transform.unicodeSetFlag = config.flags.unicodeSet && transform(options, 'unicodeSetFlag');

// unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
config.transform.unicodePropertyEscapes = config.flags.unicode && (
transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
);
config.transform.namedGroups = transform(options, 'namedGroups');

const regenerateOptions = {
'hasUnicodeFlag': config.flags.unicode && !config.transform.unicodeFlag,
'hasUnicodeFlag': config.useUnicodeFlag,
'bmpOnly': !config.flags.unicode
};

const groups = {
'onNamedGroup': options && options.onNamedGroup,
'lastIndex': 0,
Expand Down
Loading

0 comments on commit ac2e643

Please sign in to comment.