From aac8a70b730a4a53ccb7fcdc1a13dc03d235a2e7 Mon Sep 17 00:00:00 2001
From: Pali <pali@cpan.org>
Date: Thu, 8 Dec 2016 23:18:25 +0100
Subject: [PATCH] Fix appending correct number of Unicode replacement
 characters

When a truncated UTF-8 sequence was followed by another truncated UTF-8
sequence (optionally followed by a UTF-8 invariant) at the end of the
string, and the first sequence, had it not been truncated, would have run
past the end of the current string, then the whole sequence was replaced by
just one Unicode replacement character instead of two (one for each
invalid/truncated UTF-8 sequence).

This also happened when Encode was called with the STOP_AT_PARTIAL flag.
---
 Encode.xs          | 17 +++++++----------
 t/fallback.t       |  2 +-
 t/truncated_utf8.t | 45 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 11 deletions(-)
 create mode 100644 t/truncated_utf8.t

diff --git a/Encode.xs b/Encode.xs
index b5160d2..d53e60a 100644
--- a/Encode.xs
+++ b/Encode.xs
@@ -445,18 +445,15 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
         if (UTF8_IS_START(*s)) {
             U8 skip = UTF8SKIP(s);
             if ((s + skip) > e) {
-                if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
-                    const U8 *p = s + 1;
-                    for (; p < e; p++) {
-                        if (!UTF8_IS_CONTINUATION(*p)) {
-                            ulen = p-s;
-                            goto malformed_byte;
-                        }
-                    }
+                /* just calculate ulen, in pathological cases can be smaller than e-s */
+                if (e-s >= 2)
+                    convert_utf8_multi_seq(s, e-s, &ulen);
+                else
+                    ulen = 1;
+
+                if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == e-s)
                     break;
-                }
 
-                ulen = e-s;
                 goto malformed_byte;
             }
 
diff --git a/t/fallback.t b/t/fallback.t
index 86605ef..8ef8ab3 100644
--- a/t/fallback.t
+++ b/t/fallback.t
@@ -35,7 +35,7 @@ for my $i (0x80..0xff){
     $uo   .= chr($i);
     $residue    .= chr($i);
     $af .= '?';
-    $uf .= "\x{FFFD}" if $i < 0xfd;
+    $uf .= "\x{FFFD}";
     $ap .= sprintf("\\x{%04x}", $i);
     $up .= sprintf("\\x%02X", $i);
     $ah .= sprintf("&#%d;", $i);
diff --git a/t/truncated_utf8.t b/t/truncated_utf8.t
new file mode 100644
index 0000000..6917ed6
--- /dev/null
+++ b/t/truncated_utf8.t
@@ -0,0 +1,45 @@
+BEGIN {    # Environment setup and skip checks; runs before the 'use' lines below are processed.
+    if ($ENV{'PERL_CORE'}) {    # When PERL_CORE is set, run from t/ and load modules from ../lib.
+        chdir 't';
+        unshift @INC, '../lib';
+    }
+    require Config; import Config;    # Makes %Config available for the extensions check below.
+    if ($Config{'extensions'} !~ /\bEncode\b/) {    # Skip the whole file unless Encode is among the configured extensions.
+      print "1..0 # Skip: Encode was not built\n";
+      exit 0;
+    }
+    if (ord("A") == 193) {    # "A" is code 193 on EBCDIC platforms; the whole file is skipped there.
+      print "1..0 # Skip: EBCDIC\n";
+      exit 0;
+    }
+    $| = 1;    # Turn on autoflush for the currently selected output handle.
+}
+
+use strict;
+use warnings;
+
+use Encode;
+use PerlIO::encoding;
+$PerlIO::encoding::fallback &= ~(Encode::WARN_ON_ERR|Encode::PERLQQ);    # Clear the WARN_ON_ERR and PERLQQ bits; the layer tests below expect U+FFFD output.
+
+use Test::More tests => 6;    # Plan: four decode() checks plus two :encoding(UTF-8) layer checks.
+
+is(decode("UTF-8", "\xfd\xfe"), "\x{fffd}" x 2);    # Each undecodable byte is expected to become its own U+FFFD.
+is(decode("UTF-8", "\xfd\xfe\xff"), "\x{fffd}" x 3);    # Three bad bytes, three replacement characters.
+is(decode("UTF-8", "\xfd\xfe\xff\xe0"), "\x{fffd}" x 4);    # A trailing \xe0 adds one more byte and one more expected U+FFFD.
+is(decode("UTF-8", "\xfd\xfe\xff\xe0\xe1"), "\x{fffd}" x 5);    # Likewise with \xe1 appended: five bytes, five U+FFFD.
+
+my $str = ("x" x 1023) . "\xfd\xfe\xffx";    # 1023 "x", three bad bytes, one more "x". NOTE(review): 1023 presumably targets a 1024-byte buffer boundary - confirm.
+open my $fh, '<:encoding(UTF-8)', \$str;    # Read $str through the :encoding(UTF-8) layer via an in-memory filehandle.
+my $str2 = <$fh>;    # $str contains no newline, so a single readline is expected to return all of it.
+close $fh;
+is($str2, ("x" x 1023) . ("\x{fffd}" x 3) . "x");    # One U+FFFD per bad byte, with the trailing "x" preserved.
+
+TODO: {    # Same check without the trailing "x"; marked TODO, so a failure is reported as expected.
+    local $TODO = "bug in perlio";    # Reason recorded for the expected failure.
+    my $str = ("x" x 1023) . "\xfd\xfe\xff";    # The bad bytes are now the very last bytes of the input.
+    open my $fh, '<:encoding(UTF-8)', \$str;
+    my $str2 = <$fh>;
+    close $fh;
+    is($str2, ("x" x 1023) . ("\x{fffd}" x 3));    # Expected: still one U+FFFD per bad byte at end of input.
+}