Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement support for && and || in v sets #52

Merged
merged 4 commits into from
Dec 28, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,15 +114,38 @@ These options can be set to `false` or `'transform'`. When using `'transform'`,
// → '(.)\1'
```

<!--

#### Experimental regular expression features

These options can be set to `false`, `'parse'`, or `'transform'`. When using `'transform'`, the corresponding features are compiled to older syntax that can run in older browsers. When using `'parse'`, they are parsed and left as-is in the output pattern. When using `false` (the default), they result in a syntax error if used.

NOTE: regexpu currently doesn't support any ECMAScript proposals.
Once these features become stable (that is, once the proposals are accepted as part of ECMAScript), they will be parsed by default, and thus `'parse'` will behave like `false`.

- `unicodeSetsFlag` - [The `v` (`unicodeSets`) flag](https://github.com/tc39/proposal-regexp-set-notation)

```js
rewritePattern('[\\p{Emoji}&&\\p{ASCII}]', 'v', {
unicodeSetsFlag: 'transform'
});
// → '[#\*0-9]'
```

By default, patterns with the `v` flag are transformed to patterns with the `u` flag. To downlevel them further, so that the output also works without the `u` flag, additionally set the `unicodeFlag: 'transform'` option.

```js
rewritePattern('[^[a-h]&&[f-z]]', 'v', {
unicodeSetsFlag: 'transform'
});
// → '[^f-h]' (to be used with /u)
```

```js
rewritePattern('[^[a-h]&&[f-z]]', 'v', {
unicodeSetsFlag: 'transform',
unicodeFlag: 'transform'
});
// → '(?:(?![f-h])[\s\S])' (to be used without /u)
```

-->

#### Miscellaneous options

Expand Down
10 changes: 3 additions & 7 deletions demo.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,11 @@ const parse = require('regjsparser').parse;
const generate = require('regjsgen').generate;
const regenerate = require('regenerate');

const pattern = String.raw`\w`;
const pattern = String.raw`[[a-h]&&[f-z]]`;

console.log(generate(parse(pattern)));

const processedPattern = rewritePattern(pattern, 'ui', {
'unicodeFlag': 'transform'
const processedPattern = rewritePattern(pattern, 'v', {
'unicodeSetsFlag': 'transform'
});

console.log(processedPattern);

// throws
new RegExp(processedPattern, 'u');
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@
"dependencies": {
"regenerate": "^1.4.2",
"regenerate-unicode-properties": "^9.0.0",
"regjsgen": "^0.5.2",
"regjsparser": "^0.7.0",
"regjsgen": "^0.6.0",
"regjsparser": "^0.8.2",
"unicode-match-property-ecmascript": "^2.0.0",
"unicode-match-property-value-ecmascript": "^2.0.0"
},
Expand Down
160 changes: 136 additions & 24 deletions rewrite-pattern.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,6 @@ const ESCAPE_SETS = require('./data/character-class-escape-sets.js');
// Prepare a Regenerate set containing all code points, used for negative
// character classes (if any).
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. https://mths.be/es6#sec-atom
Expand Down Expand Up @@ -98,6 +95,16 @@ regenerate.prototype.iuAddRange = function(min, max) {
} while (++min <= max);
return $this;
};
/**
 * Removes from this set the case-folded counterpart (as reported by
 * `caseFold`) of every code point from `min` through `max`.
 *
 * Only the counterparts are removed; the code points in the range itself are
 * left untouched by this method.
 *
 * @param {number} min - First code point of the range.
 * @param {number} max - Last code point of the range (inclusive).
 * @returns {regenerate} This set, to allow chaining.
 */
regenerate.prototype.iuRemoveRange = function(min, max) {
	let codePoint = min;
	// A do…while is used on purpose: `min` is always examined once, even if
	// `min > max`.
	do {
		const counterpart = caseFold(codePoint);
		if (counterpart) {
			this.remove(counterpart);
		}
	} while (++codePoint <= max);
	return this;
};

const update = (item, pattern) => {
let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '');
Expand Down Expand Up @@ -128,54 +135,146 @@ const caseFold = (codePoint) => {
return iuMappings.get(codePoint) || false;
};

const processCharacterClass = (characterClassItem, regenerateOptions) => {
/**
 * Builds the accumulator callbacks used to fold the members of a character
 * class into a running regenerate set, for one particular set operation.
 *
 * Every callback receives the set accumulated so far (`undefined` before the
 * first member has been folded in) and returns the set to carry forward:
 *
 * - `single(set, cp)`            folds in one code point;
 * - `regSet(set, set2)`          folds in another regenerate set;
 * - `range(set, start, end)`     folds in an inclusive code point range;
 * - `iuRange(set, start, end)`   folds in the case-folded counterparts of a
 *                                range (via `iuAddRange` / `iuRemoveRange`).
 *
 * NOTE(review): when `set` is still undefined, several callbacks return
 * `set2` itself rather than a copy, and later callbacks mutate the
 * accumulator in place. Confirm callers never pass a shared set as the first
 * member.
 *
 * @param {string} action - One of 'union', 'union-negative', 'intersection'
 *   or 'subtraction'.
 * @returns {{single: Function, regSet: Function, range: Function, iuRange: Function}}
 *   The callbacks for `action`.
 * @throws {Error} If `action` is not one of the known operations.
 */
const buildHandler = (action) => {
	switch (action) {
		case 'union':
			return {
				single: (set, cp) => set ? set.add(cp) : regenerate(cp),
				regSet: (set, set2) => set ? set.add(set2) : set2,
				range: (set, start, end) => {
					if (!set) set = regenerate();
					set.addRange(start, end);
					return set;
				},
				iuRange: (set, start, end) => {
					if (!set) set = regenerate();
					set.iuAddRange(start, end);
					return set;
				}
			};
		case 'union-negative':
			// Each member is complemented against the full Unicode range before
			// being merged with what has been accumulated so far.
			return {
				single: (set, cp) => set && set.contains(cp) ? UNICODE_SET.clone() : UNICODE_SET.clone().remove(cp),
				regSet: (set, set2) => UNICODE_SET.clone().remove(set2).add(set || []),
				range: (set, start, end) => UNICODE_SET.clone().removeRange(start, end).add(set || []),
				iuRange: (set, start, end) => UNICODE_SET.clone().iuRemoveRange(start, end).add(set || [])
			};
		case 'intersection': {
			// Braces keep the `regSet` declaration local to this case clause.
			const regSet = (set, set2) => set ? set.intersection(set2) : set2;
			return {
				single: (set, cp) => !set || set.contains(cp) ? regenerate(cp) : regenerate(),
				regSet: regSet,
				range: (set, start, end) => regSet(set, regenerate().addRange(start, end)),
				iuRange: (set, start, end) => regSet(set, regenerate().iuAddRange(start, end))
			};
		}
		case 'subtraction':
			// The first member seeds the set; every later member is removed from it.
			return {
				single: (set, cp) => set ? set.remove(cp) : regenerate(cp),
				regSet: (set, set2) => set ? set.remove(set2) : set2,
				range: (set, start, end) => set ? set.removeRange(start, end) : regenerate().addRange(start, end),
				iuRange: (set, start, end) => set ? set.iuRemoveRange(start, end) : regenerate().iuAddRange(start, end)
			};
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			// Report the offending `action`; no character class item is in scope
			// here, so referring to one would raise a ReferenceError instead.
			throw new Error(`Unknown set action: ${ action }`);
	}
};

// Computes the regenerate set described by a character class item, recursing
// into nested character classes.
//
// Returns `{ set, transformed }`, where `set` is the accumulated set (an
// empty set for an empty class) and `transformed` starts out as
// `config.transform.unicodeFlag` and is switched to `true` when the class
// contains a nested class, or a Unicode property escape while
// `config.transform.unicodePropertyEscapes` is enabled.
//
// Throws an Error for an unknown class kind or member type.
//
// NOTE(review): this text appears to come from a diff view, so some
// statements show up twice — a removed line directly followed by its
// replacement (e.g. the two declarations of `set` just below). Confirm
// against the real file before relying on the exact statements.
const computeCharacterClass = (characterClassItem) => {
let transformed = config.transform.unicodeFlag;
const negative = characterClassItem.negative;
const set = regenerate();
let set;

// `handlePositive` folds in directly listed members and non-negated nested
// classes; `handleNegative` folds in negated nested classes.
let handlePositive;
let handleNegative;

switch (characterClassItem.kind) {
case 'union':
handlePositive = buildHandler('union');
handleNegative = buildHandler('union-negative');
break;
case 'intersection':
// Intersecting with a negated class is handled as a subtraction.
handlePositive = buildHandler('intersection');
handleNegative = buildHandler('subtraction');
break;
case 'subtraction':
// Subtracting a negated class is handled as an intersection.
handlePositive = buildHandler('subtraction');
handleNegative = buildHandler('intersection');
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
default:
throw new Error(`Unknown character class kind: ${ characterClassItem.kind }`);
}

for (const item of characterClassItem.body) {
switch (item.type) {
case 'value':
set.add(item.codePoint);
set = handlePositive.single(set, item.codePoint);
// With the ignoreCase flag set and the unicode flag being transformed,
// the case-folded counterpart (if any) is folded in as well.
if (config.flags.ignoreCase && config.transform.unicodeFlag) {
const folded = caseFold(item.codePoint);
if (folded) {
set.add(folded);
set = handlePositive.single(set, folded);
}
}
break;
case 'characterClassRange':
const min = item.min.codePoint;
const max = item.max.codePoint;
set.addRange(min, max);
set = handlePositive.range(set, min, max);
// Same case-folding treatment as for single values, applied to the range.
if (config.flags.ignoreCase && config.transform.unicodeFlag) {
set.iuAddRange(min, max);
set = handlePositive.iuRange(set, min, max);
}
break;
case 'characterClassEscape':
set.add(getCharacterClassEscapeSet(
set = handlePositive.regSet(set, getCharacterClassEscapeSet(
item.value,
config.flags.unicode,
config.flags.ignoreCase
));
break;
case 'unicodePropertyEscape':
set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
set = handlePositive.regSet(set, getUnicodePropertyEscapeSet(item.value, item.negative));
if (config.transform.unicodePropertyEscapes) {
transformed = true;
}
break;
case 'characterClass':
// Nested class: compute its set recursively and fold it in. Only the
// nested `set` is used; the nested `transformed` value is not consulted
// because nesting always marks this class as transformed.
const handler = item.negative ? handleNegative : handlePositive;
const res = computeCharacterClass(item);
set = handler.regSet(set, res.set);
transformed = true;
break;
// The `default` clause is only here as a safeguard; it should never be
// reached. Code coverage tools should ignore it.
/* istanbul ignore next */
default:
throw new Error(`Unknown term type: ${ item.type }`);
}
}

// No member was folded in, so fall back to an empty set.
if (!set) { // /[]/
set = regenerate();
}

return { set, transformed };
}

const processCharacterClass = (characterClassItem, regenerateOptions) => {
const negative = characterClassItem.negative;
const { set, transformed } = computeCharacterClass(characterClassItem);
if (transformed) {
const setStr = set.toString(regenerateOptions);
if (negative) {
update(characterClassItem, `(?!${set.toString(regenerateOptions)})[\\s\\S]`)
if (config.useUnicodeFlag) {
update(characterClassItem, `[^${setStr.slice(1, -1)}]`)
} else {
update(characterClassItem, `(?!${setStr})[\\s\\S]`)
}
} else {
update(characterClassItem, set.toString(regenerateOptions));
update(characterClassItem, setStr);
}
}
return characterClassItem;
Expand Down Expand Up @@ -307,27 +406,22 @@ const processTerm = (item, regenerateOptions, groups) => {
return item;
};

// Enable every stable RegExp feature by default
const regjsparserFeatures = {
'unicodePropertyEscape': true,
'namedGroups': true,
'lookbehind': true,
};

// Module-level settings consulted by the rewriting helpers. The values below
// are only the initial defaults: the `config.flags.*` and
// `config.transform.*` fields are reassigned from the flags and options of
// each rewrite (see the `config.flags.unicode = hasFlag(flags, 'u')` style
// assignments further down).
const config = {
// Which regular expression flags the pattern being rewritten carries.
'flags': {
'ignoreCase': false,
'unicode': false,
'unicodeSets': false,
'dotAll': false,
},
// Which features should be compiled down to older syntax.
'transform': {
'dotAllFlag': false,
'unicodeFlag': false,
'unicodeSetsFlag': false,
'unicodePropertyEscapes': false,
'namedGroups': false,
},
// Whether the output pattern is meant to be used with the `u` flag.
// NOTE(review): the two consecutive `return` statements below look like the
// removed and added lines of a diff view; as written only the first one
// executes, which ignores `flags.unicodeSets`. Confirm against the real file.
get useUnicodeFlag() {
return this.flags.unicode && !this.transform.unicodeFlag;
return (this.flags.unicode || this.flags.unicodeSets) && !this.transform.unicodeFlag;
}
};

Expand All @@ -345,6 +439,11 @@ const validateOptions = (options) => {
throw new Error(`.${key} must be false (default) or 'transform'.`);
}
break;
case 'unicodeSetsFlag':
if (value != null && value !== false && value !== 'parse' && value !== 'transform') {
throw new Error(`.${key} must be false (default), 'parse' or 'transform'.`);
}
break;
case 'onNamedGroup':
if (value != null && typeof value !== 'function') {
throw new Error('.onNamedGroup must be a function.');
Expand All @@ -363,21 +462,34 @@ const rewritePattern = (pattern, flags, options) => {
validateOptions(options);

config.flags.unicode = hasFlag(flags, 'u');
config.flags.unicodeSets = hasFlag(flags, 'v');
config.flags.ignoreCase = hasFlag(flags, 'i');
config.flags.dotAll = hasFlag(flags, 's');

config.transform.dotAllFlag = config.flags.dotAll && transform(options, 'dotAllFlag');
config.transform.unicodeFlag = config.flags.unicode && transform(options, 'unicodeFlag');
config.transform.unicodeFlag = (config.flags.unicode || config.flags.unicodeSets) && transform(options, 'unicodeFlag');
config.transform.unicodeSetsFlag = config.flags.unicodeSets && transform(options, 'unicodeSetsFlag');

// unicodeFlag: 'transform' implies unicodePropertyEscapes: 'transform'
config.transform.unicodePropertyEscapes = config.flags.unicode && (
transform(options, 'unicodeFlag') || transform(options, 'unicodePropertyEscapes')
);
config.transform.namedGroups = transform(options, 'namedGroups');

const regjsparserFeatures = {
'unicodeSet': Boolean(options && options.unicodeSetsFlag),

// Enable every stable RegExp feature by default
'unicodePropertyEscape': true,
'namedGroups': true,
'lookbehind': true,
};

const regenerateOptions = {
'hasUnicodeFlag': config.flags.unicode && !config.transform.unicodeFlag,
'hasUnicodeFlag': config.useUnicodeFlag,
'bmpOnly': !config.flags.unicode
};

const groups = {
'onNamedGroup': options && options.onNamedGroup,
'lastIndex': 0,
Expand Down
Loading