Fix appending correct number of Unicode replacement characters

When a truncated UTF-8 sequence was followed by another truncated UTF-8 sequence (optionally followed by a UTF-8 invariant) at the end of the string, and the first sequence, had it not been truncated, would have been longer than the current string, then the whole sequence was replaced by just one Unicode replacement character instead of two (one for each invalid/truncated UTF-8 sequence). This also happened when Encode was called with the STOP_AT_PARTIAL flag.
dankogai · Dec 8, 2016 · aac8a70 · aac8a70
1 parent b426e97
commit aac8a70
Show file tree

Hide file tree

Showing 3 changed files with 53 additions and 11 deletions.
diff --git a/Encode.xs b/Encode.xs
@@ -445,18 +445,15 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
         if (UTF8_IS_START(*s)) {
             U8 skip = UTF8SKIP(s);
             if ((s + skip) > e) {
-                if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
-                    const U8 *p = s + 1;
-                    for (; p < e; p++) {
-                        if (!UTF8_IS_CONTINUATION(*p)) {
-                            ulen = p-s;
-                            goto malformed_byte;
-                        }
-                    }
+                /* just calculate ulen, in pathological cases can be smaller then e-s */
+                if (e-s >= 2)
+                    convert_utf8_multi_seq(s, e-s, &ulen);
+                else
+                    ulen = 1;
+
+                if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == e-s)
                     break;
-                }
 
-                ulen = e-s;
                 goto malformed_byte;
             }
 

diff --git a/t/fallback.t b/t/fallback.t
@@ -35,7 +35,7 @@ for my $i (0x80..0xff){
     $uo   .= chr($i);
     $residue    .= chr($i);
     $af .= '?';
-    $uf .= "\x{FFFD}" if $i < 0xfd;
+    $uf .= "\x{FFFD}";
     $ap .= sprintf("\\x{%04x}", $i);
     $up .= sprintf("\\x%02X", $i);
     $ah .= sprintf("&#%d;", $i);

diff --git a/t/truncated_utf8.t b/t/truncated_utf8.t
@@ -0,0 +1,45 @@
+BEGIN {
+    if ($ENV{'PERL_CORE'}) {
+        chdir 't';
+        unshift @INC, '../lib';
+    }
+    require Config; import Config;
+    if ($Config{'extensions'} !~ /\bEncode\b/) {
+      print "1..0 # Skip: Encode was not built\n";
+      exit 0;
+    }
+    if (ord("A") == 193) {
+      print "1..0 # Skip: EBCDIC\n";
+      exit 0;
+    }
+    $| = 1;
+}
+
+use strict;
+use warnings;
+
+use Encode;
+use PerlIO::encoding;
+$PerlIO::encoding::fallback &= ~(Encode::WARN_ON_ERR|Encode::PERLQQ);
+
+use Test::More tests => 6;
+
+is(decode("UTF-8", "\xfd\xfe"), "\x{fffd}" x 2);
+is(decode("UTF-8", "\xfd\xfe\xff"), "\x{fffd}" x 3);
+is(decode("UTF-8", "\xfd\xfe\xff\xe0"), "\x{fffd}" x 4);
+is(decode("UTF-8", "\xfd\xfe\xff\xe0\xe1"), "\x{fffd}" x 5);
+
+my $str = ("x" x 1023) . "\xfd\xfe\xffx";
+open my $fh, '<:encoding(UTF-8)', \$str;
+my $str2 = <$fh>;
+close $fh;
+is($str2, ("x" x 1023) . ("\x{fffd}" x 3) . "x");
+
+TODO: {
+    local $TODO = "bug in perlio";
+    my $str = ("x" x 1023) . "\xfd\xfe\xff";
+    open my $fh, '<:encoding(UTF-8)', \$str;
+    my $str2 = <$fh>;
+    close $fh;
+    is($str2, ("x" x 1023) . ("\x{fffd}" x 3));
+}