properly replace the surrogate pairs for cases when the regex has a u…

… switch but there is no u-switch support in the runtime's RegExp
JazzleWare · Jul 7, 2016 · 1ef83e5 · 1ef83e5
1 parent afea857
commit 1ef83e5
Showing 1 changed file with 56 additions and 3 deletions.
diff --git a/lube.js b/lube.js
@@ -90,6 +90,8 @@ var CHAR_1 = char2int('1'),
     CHAR_MULTI_QUOTE = char2int('"'),
     CHAR_BACK_SLASH = char2int(('\\')),
 
+    CHAR_BACKSPACE = char2int('\b'),
+
     CHAR_DIV = char2int('/'),
     CHAR_MUL = char2int('*'),
     CHAR_MIN = char2int('-'),
@@ -146,6 +148,45 @@ try {
 catch(r) {
 }
 
+// Lookup table used by hex(): index n (0-15) holds the lowercase
+// hexadecimal digit for n.
+var hexD = '0123456789abcdef'.split('');
+
+// Render the low 16 bits of `number` as exactly four lowercase hex digits
+// (zero-padded), e.g. 0x2f -> "002f". Higher bits are ignored.
+function hex(number) {
+  var digits = "";
+  for (var shift = 12; shift >= 0; shift -= 4) {
+    digits += hexD[(number >> shift) & 0xf]; // most significant nibble first
+  }
+  return digits;
+}
+
+// replace() callback for \u{...} (b = the hex digits): yields a 4-digit \uXXXX escape, or \uFFFF above 0xFFFF.
+function curlyReplace(matchedString, b, matchIndex, wholeString ) {
+  var codePoint = parseInt( '0x' + b, 16 );
+  return codePoint <= 0xFFFF ? '\\u' + hex(codePoint) : '\\uFFFF';
+}
+
+// replace() callback for a four-digit \uXXXX escape (b, or noB as fallback, holds the hex digits). Only
+// surrogate code units become raw characters, so that a following pass can collapse surrogate pairs; every
+// other escape is returned untouched, so e.g. \u0028 cannot inject a bare "(" into the pattern being verified.
+function regexReplace(matchedString, b, noB, matchIndex, wholeString) {
+  var c = parseInt('0x' + ( b || noB ), 16 ) ;
+  _assert(c <= 0x010FFFF );
+  if ( !(c <= 0xFFFF) ) return '\uFFFF'; // beyond one UTF-16 unit: use the placeholder character
+  return ( c >= 0xD800 && c <= 0xDFFF ) ? String.fromCharCode(c) : matchedString;
+}
+
+// Compile `regex` with `flags` to check that the runtime's RegExp constructor
+// accepts the pattern.
+//   regex - pattern source text; flags - flag characters ("" for none)
+// Returns the compiled RegExp; any error thrown by the constructor propagates
+// unchanged to the caller.
+function verifyRegex(regex, flags) {
+  return new RegExp(regex, flags);
+}
+
 var Num,num = Num = function (c) { return (c >= CHAR_0 && c <= CHAR_9)};
 function isIDHead(c) {
   return (c <= CHAR_z && c >= CHAR_a) ||
@@ -2296,7 +2337,7 @@ lp. parseIdStatementOrIdExpressionOrId = function ( context ) {
 
   return pendingExprHead;
 };
-
+           
 lp.parseRegExpLiteral = function() {
      var startc = this.c - 1, startLoc = this.locOn(1),
          c = this.c, src = this.src, len = src.length;
@@ -2353,15 +2394,27 @@ lp.parseRegExpLiteral = function() {
      var patternString = src.slice(this.c, c-flagCount-1 ), flagsString = src .slice(c-flagCount,c);
      var val = null;
 
+     var normalizedRegex = patternString;
+
+     // Regexes that carry a 'u' flag need special treatment when the RegExp constructor they are sent to
+     // doesn't support the 'u' flag: since they can contain surrogate pair sequences (which are not allowed without the 'u' flag),
+     // they must be checked for such surrogate pairs, and each pair must be replaced with a character that is valid even
+     // outside the context of a 'u' flag.
+     if ( (flags & uRegexFlag) && !(regexFlagsSupported & uRegexFlag) )
+          normalizedRegex = normalizedRegex.replace( /\\u\{([A-F0-9a-f]+)\}/g, curlyReplace) // normalize curlies
+             .replace( /\\u([A-F0-9a-f][A-F0-9a-f][A-F0-9a-f][A-F0-9a-f])/g, regexReplace ) // convert u
+             .replace( /[\ud800-\udbff][\udc00-\udfff]/g, '\uFFFF' );
+
+
      // all of the 1 bits in flags must also be 1 in the same bit index in regexsupportedFlags;
      // flags ^ rsf returns a bit set in which the 1 bits mean "this flag is either not used in flags, or yt is not supported";
      // for knowing whether the 1 bit has also been 1 in flags, we '&' the above bit set with flags; the 1 bits in the
      // given bit set must both be 1 in flags and in flags ^ rsf; that is, they are both "used" and "unsupoorted or unused",
      // which would be equal to this: [used && (unsupported || !used)] == unsopprted
      if ( flags & (regexFlagsSupported^flags) )
-       new RegExp(patternString);
+        verifyRegex(normalizedRegex, "");
      else
-       val = new RegExp( patternString, flagsString ) ;
+        val = verifyRegex( patternString, flagsString ) ;
 
      this.col += (c-this.c);
      var regex = { type: 'Literal', regex: { pattern: patternString, flags: flagsString },