From aac8a70b730a4a53ccb7fcdc1a13dc03d235a2e7 Mon Sep 17 00:00:00 2001 From: Pali Date: Thu, 8 Dec 2016 23:18:25 +0100 Subject: [PATCH] Fix appending correct number of Unicode replacement characters When a truncated UTF-8 sequence was followed by another truncated UTF-8 sequence (optionally followed by a UTF-8 invariant) at the end of the string, and the first sequence, had it not been truncated, would have extended past the end of the string, the whole remainder of the string was replaced by just one Unicode replacement character instead of two (one for each invalid/truncated UTF-8 sequence). This also happened when Encode was called with the STOP_AT_PARTIAL flag. --- Encode.xs | 17 +++++++---------- t/fallback.t | 2 +- t/truncated_utf8.t | 45 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 t/truncated_utf8.t diff --git a/Encode.xs b/Encode.xs index b5160d2..d53e60a 100644 --- a/Encode.xs +++ b/Encode.xs @@ -445,18 +445,15 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv, if (UTF8_IS_START(*s)) { U8 skip = UTF8SKIP(s); if ((s + skip) > e) { - if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) { - const U8 *p = s + 1; - for (; p < e; p++) { - if (!UTF8_IS_CONTINUATION(*p)) { - ulen = p-s; - goto malformed_byte; - } - } + /* just calculate ulen, in pathological cases can be smaller then e-s */ + if (e-s >= 2) + convert_utf8_multi_seq(s, e-s, &ulen); + else + ulen = 1; + + if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == e-s) break; - } - ulen = e-s; goto malformed_byte; } diff --git a/t/fallback.t b/t/fallback.t index 86605ef..8ef8ab3 100644 --- a/t/fallback.t +++ b/t/fallback.t @@ -35,7 +35,7 @@ for my $i (0x80..0xff){ $uo .= chr($i); $residue .= chr($i); $af .= '?'; - $uf .= "\x{FFFD}" if $i < 0xfd; + $uf .= "\x{FFFD}"; $ap .= sprintf("\\x{%04x}", $i); $up .= sprintf("\\x%02X", $i); $ah .= sprintf("&#%d;", $i); diff --git a/t/truncated_utf8.t b/t/truncated_utf8.t new file mode 100644 index
0000000..6917ed6 --- /dev/null +++ b/t/truncated_utf8.t @@ -0,0 +1,45 @@ +BEGIN { + if ($ENV{'PERL_CORE'}) { + chdir 't'; + unshift @INC, '../lib'; + } + require Config; import Config; + if ($Config{'extensions'} !~ /\bEncode\b/) { + print "1..0 # Skip: Encode was not built\n"; + exit 0; + } + if (ord("A") == 193) { + print "1..0 # Skip: EBCDIC\n"; + exit 0; + } + $| = 1; +} + +use strict; +use warnings; + +use Encode; +use PerlIO::encoding; +$PerlIO::encoding::fallback &= ~(Encode::WARN_ON_ERR|Encode::PERLQQ); + +use Test::More tests => 6; + +is(decode("UTF-8", "\xfd\xfe"), "\x{fffd}" x 2); +is(decode("UTF-8", "\xfd\xfe\xff"), "\x{fffd}" x 3); +is(decode("UTF-8", "\xfd\xfe\xff\xe0"), "\x{fffd}" x 4); +is(decode("UTF-8", "\xfd\xfe\xff\xe0\xe1"), "\x{fffd}" x 5); + +my $str = ("x" x 1023) . "\xfd\xfe\xffx"; +open my $fh, '<:encoding(UTF-8)', \$str; +my $str2 = <$fh>; +close $fh; +is($str2, ("x" x 1023) . ("\x{fffd}" x 3) . "x"); + +TODO: { + local $TODO = "bug in perlio"; + my $str = ("x" x 1023) . "\xfd\xfe\xff"; + open my $fh, '<:encoding(UTF-8)', \$str; + my $str2 = <$fh>; + close $fh; + is($str2, ("x" x 1023) . ("\x{fffd}" x 3)); +}