Skip to content

Commit

Permalink
Fix appending correct number of Unicode replacement characters
Browse files Browse the repository at this point in the history
When a truncated UTF-8 sequence was followed by another truncated UTF-8
sequence (optionally followed by a UTF-8 invariant) at the end of the string,
and the first sequence, had it not been truncated, would have been longer than
the remaining string, the whole input was replaced by just one Unicode
replacement character instead of two (one for each invalid/truncated UTF-8
sequence).

This also happened when Encode was called with the STOP_AT_PARTIAL flag.
  • Loading branch information
pali committed Dec 8, 2016
1 parent b426e97 commit aac8a70
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 11 deletions.
17 changes: 7 additions & 10 deletions Encode.xs
Original file line number Diff line number Diff line change
Expand Up @@ -445,18 +445,15 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
if (UTF8_IS_START(*s)) {
U8 skip = UTF8SKIP(s);
if ((s + skip) > e) {
if (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) {
const U8 *p = s + 1;
for (; p < e; p++) {
if (!UTF8_IS_CONTINUATION(*p)) {
ulen = p-s;
goto malformed_byte;
}
}
/* just calculate ulen, in pathological cases can be smaller than e-s */
if (e-s >= 2)
convert_utf8_multi_seq(s, e-s, &ulen);
else
ulen = 1;

if ((stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL)) && ulen == e-s)
break;
}

ulen = e-s;
goto malformed_byte;
}

Expand Down
2 changes: 1 addition & 1 deletion t/fallback.t
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ for my $i (0x80..0xff){
$uo .= chr($i);
$residue .= chr($i);
$af .= '?';
$uf .= "\x{FFFD}" if $i < 0xfd;
$uf .= "\x{FFFD}";
$ap .= sprintf("\\x{%04x}", $i);
$up .= sprintf("\\x%02X", $i);
$ah .= sprintf("&#%d;", $i);
Expand Down
45 changes: 45 additions & 0 deletions t/truncated_utf8.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
BEGIN {
    # When PERL_CORE is set, run from the t/ directory and pick up
    # modules from ../lib (presumably a build inside the perl source
    # tree -- confirm against the core test harness).
    if ($ENV{'PERL_CORE'}) {
        chdir 't';
        unshift @INC, '../lib';
    }

    # Load %Config at run time of this BEGIN block; it is needed for the
    # "was Encode built?" check below.
    require Config;
    Config->import;

    # Decide whether the whole file must be skipped.  The checks are
    # evaluated in order and the first one that applies wins.
    my $skip_plan =
          $Config{'extensions'} !~ /\bEncode\b/ ? "1..0 # Skip: Encode was not built\n"
        : ord("A") == 193                        ? "1..0 # Skip: EBCDIC\n"
        :                                          undef;
    if (defined $skip_plan) {
        print $skip_plan;
        exit 0;
    }

    # Unbuffered STDOUT for the test output that follows.
    $| = 1;
}

use strict;
use warnings;

use Encode;
use PerlIO::encoding;
# Clear the WARN_ON_ERR and PERLQQ bits from the fallback flags used by the
# :encoding layer.  The layer tests below expect malformed bytes to come
# back as U+FFFD replacement characters.
$PerlIO::encoding::fallback &= ~(Encode::WARN_ON_ERR|Encode::PERLQQ);

use Test::More tests => 6;

# Read one line from an in-memory copy of $octets through the
# :encoding(UTF-8) layer and return the decoded text.
#
# Dies if the handle cannot be opened, so that a setup failure is reported
# as such instead of showing up as a wrong decoding result.
sub read_through_utf8_layer {
    my ($octets) = @_;

    open my $fh, '<:encoding(UTF-8)', \$octets
        or die "Cannot open in-memory handle with :encoding(UTF-8): $!";
    my $decoded = <$fh>;
    close $fh;

    return $decoded;
}

# Direct decode(): each byte of these inputs must produce its own U+FFFD.
is(decode("UTF-8", "\xfd\xfe"), "\x{fffd}" x 2,
    'decode \xfd\xfe gives 2 replacement characters');
is(decode("UTF-8", "\xfd\xfe\xff"), "\x{fffd}" x 3,
    'decode \xfd\xfe\xff gives 3 replacement characters');
is(decode("UTF-8", "\xfd\xfe\xff\xe0"), "\x{fffd}" x 4,
    'decode \xfd\xfe\xff\xe0 gives 4 replacement characters');
is(decode("UTF-8", "\xfd\xfe\xff\xe0\xe1"), "\x{fffd}" x 5,
    'decode \xfd\xfe\xff\xe0\xe1 gives 5 replacement characters');

# Same kind of input read through the PerlIO layer, preceded by 1023
# ASCII characters.
my $ascii_prefix = "x" x 1023;

is(read_through_utf8_layer($ascii_prefix . "\xfd\xfe\xffx"),
    $ascii_prefix . ("\x{fffd}" x 3) . "x",
    ':encoding(UTF-8) layer: \xfd\xfe\xff followed by "x" gives 3 replacement characters');

TODO: {
    local $TODO = "bug in perlio";
    is(read_through_utf8_layer($ascii_prefix . "\xfd\xfe\xff"),
        $ascii_prefix . ("\x{fffd}" x 3),
        ':encoding(UTF-8) layer: \xfd\xfe\xff at end of input gives 3 replacement characters');
}

0 comments on commit aac8a70

Please sign in to comment.