diff --git a/Encode.pm b/Encode.pm index 04e0da0..a7b9c2f 100644 --- a/Encode.pm +++ b/Encode.pm @@ -915,12 +915,13 @@ octets that represent the fallback character. For instance: Acts like C but U+I is used instead of C<\x{I}>. -Fallback for C must return decoded string (sequence of characters). So for +Fallback for C must return decoded string (sequence of characters) +and takes a list of ordinal values as its arguments. So for example if you wish to decode octets as UTF-8, and use ISO-8859-15 as a fallback for bytes that are not valid UTF-8, you could write $str = decode 'UTF-8', $octets, sub { - my $tmp = chr shift; + my $tmp = join '', map chr, @_; return decode 'ISO-8859-15', $tmp; }; diff --git a/Encode.xs b/Encode.xs index e3c1bfd..b75fc33 100644 --- a/Encode.xs +++ b/Encode.xs @@ -35,17 +35,6 @@ UNIMPLEMENTED(_encoded_bytes_to_utf8, I32) #define SvIV_nomg SvIV #endif -#ifdef UTF8_DISALLOW_ILLEGAL_INTERCHANGE -# define UTF8_ALLOW_STRICT UTF8_DISALLOW_ILLEGAL_INTERCHANGE -#else -# define UTF8_ALLOW_STRICT 0 -#endif - -#define UTF8_ALLOW_NONSTRICT (UTF8_ALLOW_ANY & \ - ~(UTF8_ALLOW_CONTINUATION | \ - UTF8_ALLOW_NON_CONTINUATION | \ - UTF8_ALLOW_LONG)) - static void Encode_XSEncoding(pTHX_ encode_t * enc) { @@ -114,6 +103,7 @@ utf8_safe_upgrade(pTHX_ SV ** src, U8 ** s, STRLEN * slen, bool modify) #define ERR_ENCODE_NOMAP "\"\\x{%04" UVxf "}\" does not map to %s" #define ERR_DECODE_NOMAP "%s \"\\x%02" UVXf "\" does not map to Unicode" +#define ERR_DECODE_STR_NOMAP "%s \"%s\" does not map to Unicode" static SV * do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) @@ -138,6 +128,31 @@ do_fallback_cb(pTHX_ UV ch, SV *fallback_cb) return retval; } +static SV * +do_bytes_fallback_cb(pTHX_ U8 *s, STRLEN slen, SV *fallback_cb) +{ + dSP; + int argc; + STRLEN i; + SV *retval = newSVpvn("",0); + ENTER; + SAVETMPS; + PUSHMARK(sp); + for (i=0; i }; # This should get caught. +} +close F; +print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ? + "ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n"; +$t++; + +open(F, $f) or die "$0: failed to open '$f' for reading: $!"; +binmode(F, ":encoding(utf8)"); { local $^W = 1; local $SIG{__WARN__} = sub { $a = shift }; @@ -74,6 +86,7 @@ binmode(F, ":encoding(utf-8)"); close F; print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; +$t++; # On VMS temporary file names like "f0." may be more readable than "f0" since # "f0" could be a logical name pointing elsewhere. diff --git a/t/enc_utf8.t b/t/enc_utf8.t index b07c573..c5e782c 100644 --- a/t/enc_utf8.t +++ b/t/enc_utf8.t @@ -26,7 +26,7 @@ use encoding 'utf8'; my @c = (127, 128, 255, 256); -print "1.." . (scalar @c + 1) . "\n"; +print "1.." . (scalar @c + 2) . "\n"; my @f; @@ -59,7 +59,19 @@ binmode(F, ":raw"); # Output raw bytes. print F chr(128); # Output illegal UTF-8. close F; open(F, $f) or die "$0: failed to open '$f' for reading: $!"; -binmode(F, ":encoding(utf-8)"); +binmode(F, ":encoding(UTF-8)"); +{ + local $^W = 1; + local $SIG{__WARN__} = sub { $a = shift }; + eval { }; # This should get caught. +} +close F; +print $a =~ qr{^UTF-8 "\\x80" does not map to Unicode} ? + "ok $t - illegal UTF-8 input\n" : "not ok $t - illegal UTF-8 input: a = " . unpack("H*", $a) . "\n"; +$t++; + +open(F, $f) or die "$0: failed to open '$f' for reading: $!"; +binmode(F, ":encoding(utf8)"); { local $^W = 1; local $SIG{__WARN__} = sub { $a = shift }; @@ -68,6 +80,7 @@ binmode(F, ":encoding(utf-8)"); close F; print $a =~ qr{^utf8 "\\x80" does not map to Unicode} ? "ok $t - illegal utf8 input\n" : "not ok $t - illegal utf8 input: a = " . unpack("H*", $a) . "\n"; +$t++; # On VMS temporary file names like "f0." may be more readable than "f0" since # "f0" could be a logical name pointing elsewhere. diff --git a/t/utf8messages.t b/t/utf8messages.t new file mode 100644 index 0000000..8dc1b68 --- /dev/null +++ b/t/utf8messages.t @@ -0,0 +1,32 @@ +use strict; +use warnings; + +use Test::More; +use Encode qw(encode decode FB_CROAK LEAVE_SRC); + +plan tests => 12; + +my @invalid; + +ok ! defined eval { encode('UTF-8', "\x{D800}", FB_CROAK | LEAVE_SRC) }, 'Surrogate codepoint \x{D800} is not encoded to strict UTF-8'; +like $@, qr/^"\\x\{d800\}" does not map to UTF-8 /, 'Error message contains strict UTF-8 name'; +@invalid = (); +encode('UTF-8', "\x{D800}", sub { @invalid = @_; return ""; }); +is_deeply \@invalid, [ 0xD800 ], 'Fallback coderef contains invalid codepoint 0xD800'; + +ok ! defined eval { decode('UTF-8', "\xed\xa0\x80", FB_CROAK | LEAVE_SRC) }, 'Surrogate UTF-8 byte sequence \xED\xA0\x80 is decoded with strict UTF-8 decoder'; +like $@, qr/^UTF-8 "\\xED\\xA0\\x80" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence'; +@invalid = (); +decode('UTF-8', "\xed\xa0\x80", sub { @invalid = @_; return ""; }); +is_deeply \@invalid, [ 0xED, 0xA0, 0x80 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0, 0x80'; + +ok ! defined eval { decode('UTF-8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with strict UTF-8 decoder'; +like $@, qr/^UTF-8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains strict UTF-8 name and original (not decoded) invalid sequence'; +@invalid = (); +decode('UTF-8', "\xed\xa0", sub { @invalid = @_; return ""; }); +is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0'; + +ok ! defined eval { decode('utf8', "\xed\xa0", FB_CROAK | LEAVE_SRC) }, 'Invalid byte sequence \xED\xA0 is not decoded with non-strict utf8 decoder'; +like $@, qr/^utf8 "\\xED\\xA0" does not map to Unicode /, 'Error message contains non-strict utf8 name and original (not decoded) invalid sequence'; +decode('utf8', "\xed\xa0", sub { @invalid = @_; return ""; }); +is_deeply \@invalid, [ 0xED, 0xA0 ], 'Fallback coderef contains invalid byte sequence 0xED, 0xA0';