Merge pull request #97 from pali/master

Fixes for Encode::utf8
dankogai · Apr 21, 2017 · 3f4f17e · 3f4f17e
2 parents c856e52 + bb60a14
commit 3f4f17e
Show file tree

Hide file tree

Showing 6 changed files with 127 additions and 34 deletions.
diff --git a/Encode.pm b/Encode.pm
@@ -915,12 +915,13 @@ octets that represent the fallback character.  For instance:
 
 Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
 
-Fallback for C<decode> must return decoded string (sequence of characters). So for
+Fallback for C<decode> must return decoded string (sequence of characters)
+and takes a list of ordinal values as its arguments. So for
 example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
 a fallback for bytes that are not valid UTF-8, you could write
 
     $str = decode 'UTF-8', $octets, sub {
-        my $tmp = chr shift;
+        my $tmp = join '', map chr, @_;
         return decode 'ISO-8859-15', $tmp;
     };
 

diff --git a/Encode.xs b/Encode.xs
@@ -35,17 +35,6 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
 #define SvIV_nomg SvIV
 #endif
 
-#ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-#   define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE
-#else
-#   define UTF8_ALLOW_STRICT 0
-#endif
-
-#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY &                    \
-                              ~(UTF8_ALLOW_CONTINUATION |         \
-                                UTF8_ALLOW_NON_CONTINUATION |     \
-                                UTF8_ALLOW_LONG))
-
 static void
 Encode_XSEncoding(pTHX_ encode_t * enc)
 {
@@ -114,6 +103,7 @@ utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify)
 
 #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
 #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
+#define ERR_DECODE_STR_NOMAP "%s \"%s\" does not map to Unicode"
 
 static SV *
 do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
@@ -138,6 +128,31 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
     return retval;
 }
 
+static SV *
+do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb)
+{
+    dSP;
+    int argc;
+    STRLEN i;
+    SV *retval = newSVpvn("",0);
+    ENTER;
+    SAVETMPS;
+    PUSHMARK(sp);
+    for (i=0; i<slen; ++i)
+        XPUSHs(sv_2mortal(newSVuv(s[i])));
+    PUTBACK;
+    argc = call_sv(fallback_cb, G_SCALAR);
+    SPAGAIN;
+    if (argc != 1){
+        croak("fallback sub must return scalar!");
+    }
+    sv_catsv(retval, POPs);
+    PUTBACK;
+    FREETMPS;
+    LEAVE;
+    return retval;
+}
+
 static SV *
 encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen,
 	      int check, STRLEN * offset, SV * term, int * retcode, 
@@ -382,7 +397,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
     U8 *ptr = s;
     bool overflowed = 0;
 
-    uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
+    uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s));
 
     len--;
     s++;
@@ -417,6 +432,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
     int check;
     U8 *d;
     STRLEN dlen;
+    char esc[80]; /* need to store UTF8SKIP * 6 + 1 */
+    int i;
 
     if (SvROK(check_sv)) {
 	/* croak("UTF-8 decoder doesn't support callback CHECK"); */
@@ -475,40 +492,56 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
         }
 
         /* If we get here there is something wrong with alleged UTF-8 */
+        /* uv is used only when encoding */
     malformed_byte:
-        uv = (UV)*s;
-        if (ulen == 0)
+        if (uv == 0)
+            uv = (UV)*s;
+        if (encode || ulen == 0)
             ulen = 1;
 
     malformed:
+        if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ)))
+            for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]);
         if (check & ENCODE_DIE_ON_ERR){
             if (encode)
-                Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
+                Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
             else
-                Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
+                Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
         }
         if (check & ENCODE_WARN_ON_ERR){
             if (encode)
                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                            ERR_ENCODE_NOMAP, uv, "utf8");
+                            ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
             else
                 Perl_warner(aTHX_ packWARN(WARN_UTF8),
-                            ERR_DECODE_NOMAP, "utf8", uv);
+                            ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
         }
         if (check & ENCODE_RETURN_ON_ERR) {
                 break;
         }
         if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
-	    SV* subchar =
-		(fallback_cb != &PL_sv_undef)
-		? do_fallback_cb(aTHX_ uv, fallback_cb)
-		: newSVpvf(check & ENCODE_PERLQQ 
-			   ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
-			   :  check & ENCODE_HTMLCREF ? "&#%" UVuf ";" 
-			   : "&#x%" UVxf ";", uv);
-	    if (encode){
-		SvUTF8_off(subchar); /* make sure no decoded string gets in */
-	    }
+            SV* subchar;
+            if (encode) {
+                subchar =
+                    (fallback_cb != &PL_sv_undef)
+                    ? do_fallback_cb(aTHX_ uv, fallback_cb)
+                    : newSVpvf(check & ENCODE_PERLQQ
+                        ? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
+                        :  check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
+                        : "&#x%" UVxf ";", uv);
+                SvUTF8_off(subchar); /* make sure no decoded string gets in */
+            } else {
+                if (fallback_cb != &PL_sv_undef) {
+                    /* in decode mode we have sequence of wrong bytes */
+                    subchar = do_bytes_fallback_cb(aTHX_ s, ulen, fallback_cb);
+                } else {
+                    char *ptr = esc;
+                    /* ENCODE_PERLQQ is already stored in esc */
+                    if (check & (ENCODE_HTMLCREF|ENCODE_XMLCREF))
+                        for (i=0; i<ulen; ++i) ptr += sprintf(ptr, ((check & ENCODE_HTMLCREF) ? "&#%u;" : "&#x%02X;"), s[i]);
+                    subchar = newSVpvn(esc, strlen(esc));
+                }
+            }
             dlen += SvCUR(subchar) - ulen;
             SvCUR_set(dst, d-(U8 *)SvPVX(dst));
             *SvEND(dst) = '\0';

diff --git a/MANIFEST b/MANIFEST
@@ -108,6 +108,7 @@ t/rt86327.t	test script
 t/rt113164.t	test script
 t/taint.t       test script
 t/unibench.pl	benchmark script
+t/utf8messages.t test script
 t/utf8ref.t	test script
 t/utf8strict.t	test script
 t/utf8warnings.t test script

diff --git a/t/enc_eucjp.t b/t/enc_eucjp.t
@@ -30,7 +30,7 @@ use encoding 'euc-jp';
 
 my @c = (127, 128, 255, 256);
 
-print "1.." . (scalar @c + 1) . "\n";
+print "1.." . (scalar @c + 2) . "\n";
 
 my @f;
 
@@ -65,7 +65,19 @@ binmode(F, ":raw"); # Output raw bytes.
 print F chr(128); # Output illegal UTF-8.
 close F;
 open(F, $f) or die "$0: failed to open '$f' for reading: $!";
-binmode(F, ":encoding(utf-8)");
+binmode(F, ":encoding(UTF-8)");
+{
+    local $^W = 1;
+    local $SIG{__WARN__} = sub { $a = shift };
+    eval { <F> }; # This should get caught.
+}
+close F;
+print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ?
+  "ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n";
+$t++;
+
+open(F, $f) or die "$0: failed to open '$f' for reading: $!";
+binmode(F, ":encoding(utf8)");
 {
     local $^W = 1;
     local $SIG{__WARN__} = sub { $a = shift };
@@ -74,6 +86,7 @@ binmode(F, ":encoding(utf-8)");
 close F;
 print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
   "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
+$t++;
 
 # On VMS temporary file names like "f0." may be more readable than "f0" since
 # "f0" could be a logical name pointing elsewhere.

diff --git a/t/enc_utf8.t b/t/enc_utf8.t
@@ -26,7 +26,7 @@ use encoding 'utf8';
 
 my @c = (127, 128, 255, 256);
 
-print "1.." . (scalar @c + 1) . "\n";
+print "1.." . (scalar @c + 2) . "\n";
 
 my @f;
 
@@ -59,7 +59,19 @@ binmode(F, ":raw"); # Output raw bytes.
 print F chr(128); # Output illegal UTF-8.
 close F;
 open(F, $f) or die "$0: failed to open '$f' for reading: $!";
-binmode(F, ":encoding(utf-8)");
+binmode(F, ":encoding(UTF-8)");
+{
+    local $^W = 1;
+    local $SIG{__WARN__} = sub { $a = shift };
+    eval { <F> }; # This should get caught.
+}
+close F;
+print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ?
+  "ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n";
+$t++;
+
+open(F, $f) or die "$0: failed to open '$f' for reading: $!";
+binmode(F, ":encoding(utf8)");
 {
     local $^W = 1;
     local $SIG{__WARN__} = sub { $a = shift };
@@ -68,6 +80,7 @@ binmode(F, ":encoding(utf-8)");
 close F;
 print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
   "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
+$t++;
 
 # On VMS temporary file names like "f0." may be more readable than "f0" since
 # "f0" could be a logical name pointing elsewhere.

diff --git a/t/utf8messages.t b/t/utf8messages.t
@@ -0,0 +1,32 @@
+use strict;
+use warnings;
+
+use Test::More;
+use Encode qw(encode decode FB_CROAK LEAVE_SRC);
+
+plan tests => 12;
+
+my @invalid;
+
+ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8';
+like $@, qr/^"\\x\{d800\}" does not map to UTF-8 /, 'Error message contains strict UTF-8 name';
+@invalid = ();
+encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800';
+
+ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder';
+like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
+@invalid = ();
+decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80';
+
+ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder';
+like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
+@invalid = ();
+decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';
+
+ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder';
+like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence';
+decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; });
+is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';