Skip to content

Commit

Permalink
Merge pull request #97 from pali/master
Browse files Browse the repository at this point in the history
Fixes for Encode::utf8
  • Loading branch information
dankogai committed Apr 21, 2017
2 parents c856e52 + bb60a14 commit 3f4f17e
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 34 deletions.
5 changes: 3 additions & 2 deletions Encode.pm
Original file line number Diff line number Diff line change
Expand Up @@ -915,12 +915,13 @@ octets that represent the fallback character. For instance:
Acts like C<FB_PERLQQ> but U+I<XXXX> is used instead of C<\x{I<XXXX>}>.
Fallback for C<decode> must return decoded string (sequence of characters). So for
Fallback for C<decode> must return decoded string (sequence of characters)
and takes a list of ordinal values as its arguments. So for
example if you wish to decode octets as UTF-8, and use ISO-8859-15 as
a fallback for bytes that are not valid UTF-8, you could write
$str = decode 'UTF-8', $octets, sub {
my $tmp = chr shift;
my $tmp = join '', map chr, @_;
return decode 'ISO-8859-15', $tmp;
};
Expand Down
89 changes: 61 additions & 28 deletions Encode.xs
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,6 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32)
#define SvIV_nomg SvIV
#endif

#ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE
# define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE
#else
# define UTF8_ALLOW_STRICT 0
#endif

#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \
~(UTF8_ALLOW_CONTINUATION | \
UTF8_ALLOW_NON_CONTINUATION | \
UTF8_ALLOW_LONG))

static void
Encode_XSEncoding(pTHX_ encode_t * enc)
{
Expand Down Expand Up @@ -114,6 +103,7 @@ utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify)

#define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s"
#define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode"
#define ERR_DECODE_STR_NOMAP "%s \"%s\" does not map to Unicode"

static SV *
do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
Expand All @@ -138,6 +128,31 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb)
return retval;
}

static SV *
do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb)
{
dSP;
int argc;
STRLEN i;
SV *retval = newSVpvn("",0);
ENTER;
SAVETMPS;
PUSHMARK(sp);
for (i=0; i<slen; ++i)
XPUSHs(sv_2mortal(newSVuv(s[i])));
PUTBACK;
argc = call_sv(fallback_cb, G_SCALAR);
SPAGAIN;
if (argc != 1){
croak("fallback sub must return scalar!");
}
sv_catsv(retval, POPs);
PUTBACK;
FREETMPS;
LEAVE;
return retval;
}

static SV *
encode_method(pTHX_ const encode_t * enc, const encpage_t * dir, SV * src, U8 * s, STRLEN slen,
int check, STRLEN * offset, SV * term, int * retcode,
Expand Down Expand Up @@ -382,7 +397,7 @@ convert_utf8_multi_seq(U8* s, STRLEN len, STRLEN *rlen)
U8 *ptr = s;
bool overflowed = 0;

uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(len);
uv = NATIVE_TO_UTF(*s) & UTF_START_MASK(UTF8SKIP(s));

len--;
s++;
Expand Down Expand Up @@ -417,6 +432,8 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
int check;
U8 *d;
STRLEN dlen;
char esc[80]; /* need to store UTF8SKIP * 6 + 1 */
int i;

if (SvROK(check_sv)) {
/* croak("UTF-8 decoder doesn't support callback CHECK"); */
Expand Down Expand Up @@ -475,40 +492,56 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
}

/* If we get here there is something wrong with alleged UTF-8 */
/* uv is used only when encoding */
malformed_byte:
uv = (UV)*s;
if (ulen == 0)
if (uv == 0)
uv = (UV)*s;
if (encode || ulen == 0)
ulen = 1;

malformed:
if (!encode && (check & (ENCODE_DIE_ON_ERR|ENCODE_WARN_ON_ERR|ENCODE_PERLQQ)))
for (i=0; i<ulen; ++i) sprintf(esc+4*i, "\\x%02X", s[i]);
if (check & ENCODE_DIE_ON_ERR){
if (encode)
Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, "utf8");
Perl_croak(aTHX_ ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
else
Perl_croak(aTHX_ ERR_DECODE_NOMAP, "utf8", uv);
Perl_croak(aTHX_ ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
}
if (check & ENCODE_WARN_ON_ERR){
if (encode)
Perl_warner(aTHX_ packWARN(WARN_UTF8),
ERR_ENCODE_NOMAP, uv, "utf8");
ERR_ENCODE_NOMAP, uv, (strict ? "UTF-8" : "utf8"));
else
Perl_warner(aTHX_ packWARN(WARN_UTF8),
ERR_DECODE_NOMAP, "utf8", uv);
ERR_DECODE_STR_NOMAP, (strict ? "UTF-8" : "utf8"), esc);
}
if (check & ENCODE_RETURN_ON_ERR) {
break;
}
if (check & (ENCODE_PERLQQ|ENCODE_HTMLCREF|ENCODE_XMLCREF)){
SV* subchar =
(fallback_cb != &PL_sv_undef)
? do_fallback_cb(aTHX_ uv, fallback_cb)
: newSVpvf(check & ENCODE_PERLQQ
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
: "&#x%" UVxf ";", uv);
if (encode){
SvUTF8_off(subchar); /* make sure no decoded string gets in */
}
SV* subchar;
if (encode) {
subchar =
(fallback_cb != &PL_sv_undef)
? do_fallback_cb(aTHX_ uv, fallback_cb)
: newSVpvf(check & ENCODE_PERLQQ
? (ulen == 1 ? "\\x%02" UVXf : "\\x{%04" UVXf "}")
: check & ENCODE_HTMLCREF ? "&#%" UVuf ";"
: "&#x%" UVxf ";", uv);
SvUTF8_off(subchar); /* make sure no decoded string gets in */
} else {
if (fallback_cb != &PL_sv_undef) {
/* in decode mode we have sequence of wrong bytes */
subchar = do_bytes_fallback_cb(aTHX_ s, ulen, fallback_cb);
} else {
char *ptr = esc;
/* ENCODE_PERLQQ is already stored in esc */
if (check & (ENCODE_HTMLCREF|ENCODE_XMLCREF))
for (i=0; i<ulen; ++i) ptr += sprintf(ptr, ((check & ENCODE_HTMLCREF) ? "&#%u;" : "&#x%02X;"), s[i]);
subchar = newSVpvn(esc, strlen(esc));
}
}
dlen += SvCUR(subchar) - ulen;
SvCUR_set(dst, d-(U8 *)SvPVX(dst));
*SvEND(dst) = '\0';
Expand Down
1 change: 1 addition & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ t/rt86327.t test script
t/rt113164.t test script
t/taint.t test script
t/unibench.pl benchmark script
t/utf8messages.t test script
t/utf8ref.t test script
t/utf8strict.t test script
t/utf8warnings.t test script
Expand Down
17 changes: 15 additions & 2 deletions t/enc_eucjp.t
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use encoding 'euc-jp';

my @c = (127, 128, 255, 256);

print "1.." . (scalar @c + 1) . "\n";
print "1.." . (scalar @c + 2) . "\n";

my @f;

Expand Down Expand Up @@ -65,7 +65,19 @@ binmode(F, ":raw"); # Output raw bytes.
print F chr(128); # Output illegal UTF-8.
close F;
open(F, $f) or die "$0: failed to open '$f' for reading: $!";
binmode(F, ":encoding(utf-8)");
binmode(F, ":encoding(UTF-8)");
{
local $^W = 1;
local $SIG{__WARN__} = sub { $a = shift };
eval { <F> }; # This should get caught.
}
close F;
print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ?
"ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n";
$t++;

open(F, $f) or die "$0: failed to open '$f' for reading: $!";
binmode(F, ":encoding(utf8)");
{
local $^W = 1;
local $SIG{__WARN__} = sub { $a = shift };
Expand All @@ -74,6 +86,7 @@ binmode(F, ":encoding(utf-8)");
close F;
print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
"ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
$t++;

# On VMS temporary file names like "f0." may be more readable than "f0" since
# "f0" could be a logical name pointing elsewhere.
Expand Down
17 changes: 15 additions & 2 deletions t/enc_utf8.t
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use encoding 'utf8';

my @c = (127, 128, 255, 256);

print "1.." . (scalar @c + 1) . "\n";
print "1.." . (scalar @c + 2) . "\n";

my @f;

Expand Down Expand Up @@ -59,7 +59,19 @@ binmode(F, ":raw"); # Output raw bytes.
print F chr(128); # Output illegal UTF-8.
close F;
open(F, $f) or die "$0: failed to open '$f' for reading: $!";
binmode(F, ":encoding(utf-8)");
binmode(F, ":encoding(UTF-8)");
{
local $^W = 1;
local $SIG{__WARN__} = sub { $a = shift };
eval { <F> }; # This should get caught.
}
close F;
print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ?
"ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n";
$t++;

open(F, $f) or die "$0: failed to open '$f' for reading: $!";
binmode(F, ":encoding(utf8)");
{
local $^W = 1;
local $SIG{__WARN__} = sub { $a = shift };
Expand All @@ -68,6 +80,7 @@ binmode(F, ":encoding(utf-8)");
close F;
print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ?
"ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n";
$t++;

# On VMS temporary file names like "f0." may be more readable than "f0" since
# "f0" could be a logical name pointing elsewhere.
Expand Down
32 changes: 32 additions & 0 deletions t/utf8messages.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
use strict;
use warnings;

use Test::More;
use Encode qw(encode decode FB_CROAK LEAVE_SRC);

plan tests => 12;

my @invalid;

ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8';
like $@, qr/^"\\x\{d800\}" does not map to UTF-8 /, 'Error message contains strict UTF-8 name';
@invalid = ();
encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800';

ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder';
like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
@invalid = ();
decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80';

ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder';
like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence';
@invalid = ();
decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';

ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder';
like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence';
decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; });
is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';

0 comments on commit 3f4f17e

Please sign in to comment.