diff --git a/embed.fnc b/embed.fnc index bdebb2a1e56d..b1df6c78fa8f 100644 --- a/embed.fnc +++ b/embed.fnc @@ -919,6 +919,11 @@ CTp |Signal_t|csighandler1 |int sig CTp |Signal_t|csighandler3 |int sig \ |NULLOK Siginfo_t *info \ |NULLOK void *uap +ATdmp |bool |c9strict_utf8_to_uv \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p EXp |regexp_engine const *|current_re_engine RXp |XOPRETANY|custom_op_get_field \ |NN const OP *o \ @@ -1174,6 +1179,11 @@ AOdp |SV * |eval_pv |NN const char *p \ |I32 croak_on_error AOdp |SSize_t|eval_sv |NN SV *sv \ |I32 flags +ATdmp |bool |extended_utf8_to_uv \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p Adfpv |void |fatal_warner |U32 err \ |NN const char *pat \ |... @@ -3070,6 +3080,11 @@ dopx |PerlIO *|start_glob |NN SV *tmpglob \ |NN IO *io Adp |I32 |start_subparse |I32 is_format \ |U32 flags +ATdmp |bool |strict_utf8_to_uv \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p CRp |NV |str_to_version |NN SV *sv : Used in pp_ctl.c p |void |sub_crush_depth|NN CV *cv @@ -3671,12 +3686,11 @@ ARTdmp |U8 * |utf8_hop_safe |NN const U8 *s \ |NN const U8 * const end ARdp |STRLEN |utf8_length |NN const U8 *s0 \ |NN const U8 *e - -AMTdp |UV |utf8n_to_uvchr |NN const U8 *s \ +ATdmp |UV |utf8n_to_uvchr |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ |const U32 flags -AMTdp |UV |utf8n_to_uvchr_error \ +ATdmp |UV |utf8n_to_uvchr_error \ |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ @@ -3689,13 +3703,6 @@ ATdip |UV |utf8n_to_uvchr_msgs \ |const U32 flags \ |NULLOK U32 *errors \ |NULLOK AV **msgs -CTp |UV |_utf8n_to_uvchr_msgs_helper \ - |NN const U8 *s \ - |STRLEN curlen \ - |NULLOK STRLEN *retlen \ - |const U32 flags \ - |NULLOK U32 *errors \ - |NULLOK AV **msgs CDbdp |UV |utf8n_to_uvuni |NN const U8 *s \ |STRLEN curlen \ |NULLOK STRLEN *retlen \ @@ -3725,16 +3732,44 @@ EMXp |U8 * |utf16_to_utf8_reversed \ |NN U8 *d \ |Size_t bytelen \ |NN Size_t *newlen +ATdmp |bool |utf8_to_uv |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p ADbdp |UV |utf8_to_uvchr |NN const U8 *s \ |NULLOK STRLEN *retlen -AMdp |UV |utf8_to_uvchr_buf \ - |NN const U8 *s \ - |NN const U8 *send \ - |NULLOK STRLEN *retlen -Cip |UV |utf8_to_uvchr_buf_helper \ +AMdip |UV |utf8_to_uvchr_buf \ |NN const U8 *s \ |NN const U8 *send \ |NULLOK STRLEN *retlen +ATdmp |bool |utf8_to_uv_errors \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flags \ + |NULLOK U32 *errors +ATdmp |bool |utf8_to_uv_flags \ + |NN const U8 * const s \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flag +ATdip |bool |utf8_to_uv_msgs|NN const U8 * const s0 \ + |NN const U8 *e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flags \ + |NULLOK U32 *errors \ + |NULLOK AV **msgs +CTp |bool |utf8_to_uv_msgs_helper_ \ + |NN const U8 * const s0 \ + |NN const U8 * const e \ + |NN UV *cp_p \ + |NULLOK Size_t *advance_p \ + |const U32 flags \ + |NULLOK U32 *errors \ + |NULLOK AV **msgs CDbdp |UV |utf8_to_uvuni |NN const U8 *s \ |NULLOK STRLEN *retlen : Used in perly.y diff --git a/embed.h b/embed.h index e0aaa3293cb6..af0c247bb25a 100644 --- a/embed.h +++ b/embed.h @@ -125,7 +125,6 @@ # define _to_utf8_lower_flags(a,b,c,d,e) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e) # define _to_utf8_title_flags(a,b,c,d,e) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e) # define _to_utf8_upper_flags(a,b,c,d,e) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e) -# define _utf8n_to_uvchr_msgs_helper Perl__utf8n_to_uvchr_msgs_helper # define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d) # define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b) # define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d) @@ -158,6 +157,7 @@ # define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d) # define bytes_from_utf8_loc Perl_bytes_from_utf8_loc # define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b) +# define c9strict_utf8_to_uv Perl_c9strict_utf8_to_uv # define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c) # define call_atexit(a,b) Perl_call_atexit(aTHX_ a,b) # define call_list(a,b) Perl_call_list(aTHX_ a,b) @@ -222,6 +222,7 @@ # define dump_vindent(a,b,c,d) Perl_dump_vindent(aTHX_ a,b,c,d) # define eval_pv(a,b) Perl_eval_pv(aTHX_ a,b) # define eval_sv(a,b) Perl_eval_sv(aTHX_ a,b) +# define extended_utf8_to_uv Perl_extended_utf8_to_uv # define fbm_compile(a,b) Perl_fbm_compile(aTHX_ a,b) # define fbm_instr(a,b,c,d) Perl_fbm_instr(aTHX_ a,b,c,d) # define filter_add(a,b) Perl_filter_add(aTHX_ a,b) @@ -645,6 +646,7 @@ # define stack_grow(a,b,c) Perl_stack_grow(aTHX_ a,b,c) # define start_subparse(a,b) Perl_start_subparse(aTHX_ a,b) # define str_to_version(a) Perl_str_to_version(aTHX_ a) +# define strict_utf8_to_uv Perl_strict_utf8_to_uv # define suspend_compcv(a) Perl_suspend_compcv(aTHX_ a) # define sv_2bool_flags(a,b) Perl_sv_2bool_flags(aTHX_ a,b) # define sv_2cv(a,b,c,d) Perl_sv_2cv(aTHX_ a,b,c,d) @@ -797,7 +799,13 @@ # define utf8_hop_safe Perl_utf8_hop_safe # define utf8_length(a,b) Perl_utf8_length(aTHX_ a,b) # define utf8_to_bytes(a,b) Perl_utf8_to_bytes(aTHX_ a,b) -# define utf8_to_uvchr_buf_helper(a,b,c) Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c) +# define utf8_to_uv Perl_utf8_to_uv +# define utf8_to_uv_errors Perl_utf8_to_uv_errors +# define utf8_to_uv_flags Perl_utf8_to_uv_flags +# define utf8_to_uv_msgs Perl_utf8_to_uv_msgs +# define utf8_to_uv_msgs_helper_ Perl_utf8_to_uv_msgs_helper_ +# define utf8n_to_uvchr Perl_utf8n_to_uvchr +# define utf8n_to_uvchr_error Perl_utf8n_to_uvchr_error # define utf8n_to_uvchr_msgs Perl_utf8n_to_uvchr_msgs # define uvchr_to_utf8(a,b) Perl_uvchr_to_utf8(aTHX,a,b) # define uvchr_to_utf8_flags(a,b,c) Perl_uvchr_to_utf8_flags(aTHX,a,b,c) diff --git a/inline.h b/inline.h index f5c2df3a6d6a..8100e6fc390b 100644 --- a/inline.h +++ b/inline.h @@ -1231,17 +1231,6 @@ Perl_append_utf8_from_native_byte(const U8 byte, U8** dest) } } -/* -=for apidoc valid_utf8_to_uvchr -Like C>, but should only be called when it is -known that the next character in the input UTF-8 string C is well-formed -(I, it passes C>. Surrogates, non-character code -points, and non-Unicode code points are allowed. - -=cut - - */ - PERL_STATIC_INLINE UV Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen) { @@ -2053,7 +2042,7 @@ C> (and kin); and if C is C, they give the same results as C> (and kin). Otherwise C may be any combination of the C> flags understood by -C>, with the same meanings. +C>, with the same meanings. It's better to use one of the non-C<_flags> functions if they give you the desired strictness, as those have a better chance of being inlined by the C @@ -2307,7 +2296,7 @@ as C>; and if C is C, this gives the same results as C>. Otherwise C may be any combination of the C> flags -understood by C>, with the same meanings. +understood by C>, with the same meanings. The three alternative macros are for the most commonly needed validations; they are likely to run somewhat faster than this more general one, as they can be @@ -2932,7 +2921,7 @@ C when the latter is called with a zero C parameter. This parameter is used to restrict the classes of code points that are considered to be valid. When zero, Perl's extended UTF-8 is used. Otherwise C can be any combination of the C> -flags accepted by C>. If there is any sequence of bytes +flags accepted by C>. If there is any sequence of bytes that can complete the input partial character in such a way that a non-prohibited character is formed, the function returns TRUE; otherwise FALSE. Non-character code points cannot be determined based on partial character @@ -3004,7 +2993,7 @@ complete code point, this will return TRUE anyway, provided that C> returns TRUE for them. C can be zero or any combination of the C> flags -accepted by C>, and with the same meanings. +accepted by C>, and with the same meanings. The functions differ from C> only in that the latter returns FALSE if the final few bytes of the string don't form a complete code @@ -3049,21 +3038,22 @@ Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s, || is_utf8_valid_partial_char_flags(*ep, s + len, flags); } -PERL_STATIC_INLINE UV -Perl_utf8n_to_uvchr_msgs(const U8 * const s0, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors, - AV ** msgs) +PERL_STATIC_INLINE bool +Perl_utf8_to_uv_msgs(const U8 * const s0, + const U8 * const e, + UV * cp_p, + Size_t *advance_p, + const U32 flags, + U32 * errors, + AV ** msgs) { - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; + PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS; - /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the - * simple cases, and, if necessary calls a helper function to deal with the - * more complex ones. Almost all well-formed non-problematic code points - * are considered simple, so that it's unlikely that the helper function - * will need to be called. */ + /* This is the inlined portion of utf8_to_uv_msgs. It handles the simple + * cases, and, if necessary calls a helper function to deal with the more + * complex ones. Almost all well-formed non-problematic code points are + * considered simple, so that it's unlikely that the helper function will + * need to be called. */ /* Assume that isn't malformed; the vast majority of calls won't be */ if (errors) { @@ -3076,9 +3066,9 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, /* No calls from core pass in an empty string; non-core need a check */ #ifdef PERL_CORE - assert(curlen > 0); + assert(e > s0); #else - if (LIKELY(curlen > 0)) + if (LIKELY(e > s0)) #endif { @@ -3086,15 +3076,15 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, * capable of handling this, but this shortcuts this very common case * */ if (UTF8_IS_INVARIANT(*s0)) { - if (retlen) { - *retlen = 1; + if (advance_p) { + *advance_p = 1; } - return *s0; + *cp_p = *s0; + return true; } const U8 * s = s0; - const U8 * send = s + curlen; /* This dfa is fast. If it accepts the input, it was for a * well-formed, non-problematic code point, which can be returned @@ -3115,7 +3105,7 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, PERL_UINT_FAST8_T state = PL_strict_utf8_dfa_tab[256 + type]; UV uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s); - while (state > 1 && ++s < send) { + while (state > 1 && ++s < e) { type = PL_strict_utf8_dfa_tab[*s]; state = PL_strict_utf8_dfa_tab[256 + state + type]; @@ -3123,42 +3113,75 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0, } if (LIKELY(state == 0)) { - if (retlen) { - *retlen = s - s0 + 1; + if (advance_p) { + *advance_p = s - s0 + 1; } - return UNI_TO_NATIVE(uv); + *cp_p = UNI_TO_NATIVE(uv); + return true; } } /* Here is potentially problematic. Use the full mechanism */ - return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, - errors, msgs); + return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs); } PERL_STATIC_INLINE UV -Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +Perl_utf8n_to_uvchr_msgs(const U8 * const s0, + STRLEN curlen, + STRLEN *retlen, + const U32 flags, + U32 * errors, + AV ** msgs) { - PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER; + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS; - assert(s < send); + UV cp; + if (LIKELY(utf8_to_uv_msgs(s0, s0 + curlen, &cp, retlen, flags, errors, + msgs))) + { + return cp; + } + + if (flags & UTF8_CHECK_ONLY && retlen) { + *retlen = ((STRLEN) -1); + } + + return 0; +} - if (! ckWARN_d(WARN_UTF8)) { - /* EMPTY is not really allowed, and asserts on debugging builds. But - * on non-debugging we have to deal with it, and this causes it to - * return the REPLACEMENT CHARACTER, as the documentation indicates */ - return utf8n_to_uvchr(s, send - s, retlen, - (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY)); +PERL_STATIC_INLINE UV +Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) +{ + PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; + assert(s < send); + + UV cp; + + /* When everything is legal, just return that; but when not: + * 1) if warnings are enabled return 0 and retlen to -1 + * 2) if warnings are disabled, set 'flags' to accept any malformation, + * but that will just cause the REPLACEMENT CHARACTER to be returned, + * as the documentation indicates. EMPTY is not really allowed, and + * asserts on debugging builds. But on non-debugging we have to deal + * with it. + * This API means 0 can mean a legal NUL, or the input is malformed; and + * the caller has to know if warnings are disabled to know if it can rely on + * 'retlen'. Best to use utf8_to_uv() instead */ + U32 flags = (ckWARN_d(WARN_UTF8)) ? 0 : (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY); + + if ( LIKELY(utf8_to_uv_flags(s, send, &cp, retlen, flags)) + || flags) + { + return cp; } - else { - UV ret = utf8n_to_uvchr(s, send - s, retlen, 0); - if (retlen && ret == 0 && (send <= s || *s != '\0')) { - *retlen = (STRLEN) -1; - } - return ret; + if (retlen) { + *retlen = (STRLEN) -1; } + + return 0; } /* ------------------------------- perl.h ----------------------------- */ diff --git a/mathoms.c b/mathoms.c index 316e0e84665a..0cb7ce32525f 100644 --- a/mathoms.c +++ b/mathoms.c @@ -851,29 +851,6 @@ Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags)); } -/* -=for apidoc_section $unicode -=for apidoc utf8_to_uvchr - -Returns the native code point of the first character in the string C -which is assumed to be in UTF-8 encoding; C will be set to the -length, in bytes, of that character. - -Some, but not all, UTF-8 malformations are detected, and in fact, some -malformed input could cause reading beyond the end of the input buffer, which -is why this function is deprecated. Use L instead. - -If C points to one of the detected malformations, and UTF8 warnings are -enabled, zero is returned and C<*retlen> is set (if C isn't -C) to -1. If those warnings are off, the computed value if well-defined (or -the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen> -is set (if C isn't NULL) so that (S + C<*retlen>>) is the -next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is returned. - -=cut -*/ - UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { diff --git a/proto.h b/proto.h index acf8015623d7..db862847a3ea 100644 --- a/proto.h +++ b/proto.h @@ -160,11 +160,6 @@ Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const U8 *e, U8 *ustrp, STRLEN *len #define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS \ assert(p); assert(ustrp) -PERL_CALLCONV UV -Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 *errors, AV **msgs); -#define PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER \ - assert(s) - PERL_CALLCONV_NO_RET void Perl_abort_execution(pTHX_ SV *msg_sv, const char * const name) __attribute__noreturn__ @@ -418,6 +413,9 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp); #define PERL_ARGS_ASSERT_BYTES_TO_UTF8 \ assert(s); assert(lenp) +/* PERL_CALLCONV bool +Perl_c9strict_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ + PERL_CALLCONV SSize_t Perl_call_argv(pTHX_ const char *sub_name, I32 flags, char **argv); #define PERL_ARGS_ASSERT_CALL_ARGV \ @@ -1113,6 +1111,9 @@ Perl_eval_sv(pTHX_ SV *sv, I32 flags); #define PERL_ARGS_ASSERT_EVAL_SV \ assert(sv) +/* PERL_CALLCONV bool +Perl_extended_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ + PERL_CALLCONV void Perl_fatal_warner(pTHX_ U32 err, const char *pat, ...) __attribute__format__(__printf__,pTHX_2,pTHX_3); @@ -4301,6 +4302,9 @@ Perl_str_to_version(pTHX_ SV *sv) #define PERL_ARGS_ASSERT_STR_TO_VERSION \ assert(sv) +/* PERL_CALLCONV bool +Perl_strict_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ + PERL_CALLCONV void Perl_sub_crush_depth(pTHX_ CV *cv) __attribute__visibility__("hidden"); @@ -5165,20 +5169,25 @@ Perl_utf8_to_utf16_base(pTHX_ U8 *s, U8 *d, Size_t bytelen, Size_t *newlen, cons #define PERL_ARGS_ASSERT_UTF8_TO_UTF16_BASE \ assert(s); assert(d); assert(newlen) -PERL_CALLCONV UV -Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); -#define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \ - assert(s); assert(send) +/* PERL_CALLCONV bool +Perl_utf8_to_uv(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p); */ -PERL_CALLCONV UV -Perl_utf8n_to_uvchr(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags); -#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR \ - assert(s) +/* PERL_CALLCONV bool +Perl_utf8_to_uv_errors(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors); */ -PERL_CALLCONV UV -Perl_utf8n_to_uvchr_error(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 *errors); -#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR \ - assert(s) +/* PERL_CALLCONV bool +Perl_utf8_to_uv_flags(const U8 * const s, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flag); */ + +PERL_CALLCONV bool +Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, const U8 * const e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors, AV **msgs); +#define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_ \ + assert(s0); assert(e); assert(cp_p) + +/* PERL_CALLCONV UV +Perl_utf8n_to_uvchr(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags); */ + +/* PERL_CALLCONV UV +Perl_utf8n_to_uvchr_error(const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 *errors); */ PERL_CALLCONV void Perl_utilize(pTHX_ int aver, I32 floor, OP *version, OP *idop, OP *arg) @@ -10099,9 +10108,14 @@ Perl_utf8_hop_overshoot(const U8 *s, SSize_t off, const U8 * const start, const # define PERL_ARGS_ASSERT_UTF8_HOP_OVERSHOOT \ assert(s); assert(start); assert(end) +PERL_STATIC_INLINE bool +Perl_utf8_to_uv_msgs(const U8 * const s0, const U8 *e, UV *cp_p, Size_t *advance_p, const U32 flags, U32 *errors, AV **msgs); +# define PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS \ + assert(s0); assert(e); assert(cp_p) + PERL_STATIC_INLINE UV -Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); -# define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER \ +Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen); +# define PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF \ assert(s); assert(send) PERL_STATIC_INLINE UV diff --git a/utf8.c b/utf8.c index 12b387fa9a45..34e5487e0b12 100644 --- a/utf8.c +++ b/utf8.c @@ -50,7 +50,7 @@ Perl__force_out_malformed_utf8_message(pTHX_ const U8 *const p, /* First byte in UTF-8 sequence */ const U8 * const e, /* Final byte in sequence (may include multiple chars */ - const U32 flags, /* Flags to pass to utf8n_to_uvchr(), + const U32 flags, /* Flags to pass to utf8_to_uv(), usually 0, or some DISALLOW flags */ const bool die_here) /* If TRUE, this function does not return */ { @@ -725,7 +725,7 @@ Perl_is_utf8_char_helper_(const U8 * const s, const U8 * e, const U32 flags) * nowhere else. The function has to cope as best it can if that * sequence does not form a full character. * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags - * accepted by L. If non-zero, this function returns + * accepted by L. If non-zero, this function returns * 0 if it determines the input will match something disallowed. * On output: * The return is the number of bytes required to represent the code point @@ -1000,173 +1000,312 @@ S_unexpected_non_continuation_text(pTHX_ const U8 * const s, /* -=for apidoc utf8n_to_uvchr +=for apidoc utf8_to_uv +=for apidoc_item extended_utf8_to_uv +=for apidoc_item strict_utf8_to_uv +=for apidoc_item c9strict_utf8_to_uv +=for apidoc_item valid_utf8_to_uvchr +=for apidoc_item utf8_to_uvchr_buf +=for apidoc_item utf8_to_uvchr + +These functions each translate from UTF-8 to UTF-32 (or UTF-64 on 64 bit +platforms). In other words, to a code point ordinal value. (On EBCDIC +platforms, the initial encoding is UTF-EBCDIC, and the output is a native code +point). + +For example, the string "A" would be converted to the number 65 on an ASCII +platform, and to 193 on an EBCDIC one. Converting the string "ABC" would yield +the same results, as the functions stop after the first character converted. +Converting the string "\N{LATIN CAPITAL LETTER A WITH MACRON} plus anything +more in the string" would yield the number 0x100 on both types of platforms, +since the first character is U+0100. + +The functions whose names contain C are older than the functions +whose names don't have C in them. The API in the older functions is +harder to use correctly, and so they are kept only for backwards compatibility, +and may eventually become deprecated. If you are writing a module and use +L, your code can use the new functions back to at least Perl +v5.7.1. (C is the exception to this name rule; its API is +not problematic, and it is in no danger of becoming deprecated. But it is +highly specialized so should rarely occur in actual code.) + +All the functions accept, without complaint, well-formed UTF-8 for any +non-problematic Unicode code point 0 .. 0x10FFFF. There are two types of +Unicode problematic code points: surrogate characters and non-character code +points. (See L.) Some of the functions reject one or both of +these. Private use characters and those code points yet to be assigned to a +particular character are never considered problematic. Additionally, most of +the functions accept non-Unicode code points, those starting at 0x110000. -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Most code should use L() rather than call this -directly. - -Bottom level UTF-8 decode routine. -Returns the native code point value of the first character in the string C, -which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than -C bytes; C<*retlen> (if C isn't NULL) will be set to -the length, in bytes, of that character. - -The value of C determines the behavior when either C does not point -to a well-formed UTF-8 character, or the pointed-to code point is a member of -certain potentially problematic classes (listed below). If C is 0, all -such classes are accepted, and encountering a malformation causes zero to be -returned and C<*retlen> to be set so that (S + C<*retlen>>) is the next -possible position in C that could begin a non-malformed character. For -malformations, if UTF-8 warnings haven't been lexically disabled, a warning is -also raised. Some UTF-8 input sequences may contain multiple malformations. -This function tries to find every possible one in each call, so multiple -warnings can be raised for the same sequence. - -Various ALLOW flags can be set in C to allow (and not warn on) -individual types of malformations, such as the sequence being overlong (that -is, there is a shorter sequence that can express the same code point; overlong -sequences are expressly forbidden in the UTF-8 standard due to potential -security issues). Another malformation example is the first byte of the input -sequence not being a legal first byte. See F for the list of such -flags. Even if allowed, this function generally returns the Unicode -REPLACEMENT CHARACTER when it encounters a malformation. There are flags in -F to override this behavior for the overlong malformations, but don't -do that except for very specialized purposes. - -The C flag overrides the behavior when a non-allowed (by other -flags) malformation is found. If this flag is set, the routine assumes that -the caller will raise a warning, and this function will silently just set -C to C<-1> (cast to C) and return zero. - -Note that this API requires disambiguation between successful decoding a C -character, and an error return (unless the C flag is set), as -in both cases, 0 is returned, and, depending on the malformation, C may -be set to -1. To disambiguate, upon a zero return, see if the first byte of -C is 0 as well. If so, the input was a C; if not, the input had an -error. Or you can use C>. - -Certain classes of code points are considered problematic. These are Unicode -surrogates, Unicode non-characters, and code points above the Unicode maximum -of 0x10FFFF. By default these are considered regular code points, but certain -situations warrant special handling for them, which can be specified using the -C parameter. If C contains C, -all three classes are treated as malformations and handled as such. The flags -C, C, and -C (meaning above the legal Unicode maximum) can be set to -disallow these categories individually. C -restricts the allowed inputs to the strict UTF-8 traditionally defined by -Unicode. Use C to use the strictness -definition given by -L. -The difference between traditional strictness and C9 strictness is that the -latter does not forbid non-character code points. (They are still discouraged, -however.) For more discussion see L. - -The flags C, -C, C, -C, and C will cause warning messages to be -raised for their respective categories, but otherwise the code points are -considered valid (not malformations). To get a category to both be treated as -a malformation and raise a warning, specify both the WARN and DISALLOW flags. -(But note that warnings are not raised if lexically disabled nor if -C is also specified.) +=over 4 -Extremely high code points were never specified in any standard, and require an -extension to UTF-8 to express, which Perl does. It is likely that programs -written in something other than Perl would not be able to read files that -contain these; nor would Perl understand files written by something that uses a -different extension. For these reasons, there is a separate set of flags that -can warn and/or disallow these extremely high code points, even if other -above-Unicode ones are accepted. They are the C and -C flags. For more information see -C>. Of course C will treat all -above-Unicode code points, including these, as malformations. -(Note that the Unicode standard considers anything above 0x10FFFF to be -illegal, but there are standards predating it that allow up to 0x7FFF_FFFF -(2**31 -1)) - -A somewhat misleadingly named synonym for C is -retained for backward compatibility: C. Similarly, -C is usable instead of the more accurately named -C. The names are misleading because these flags -can apply to code points that actually do fit in 31 bits. This happens on -EBCDIC platforms, and sometimes when the L> is also present. The new names accurately -describe the situation in all cases. - -All other code points corresponding to Unicode characters, including private -use and those yet to be assigned, are never considered malformed and never -warn. +=item C forms + +Almost all code should use only C, C, +C, or C. The other functions are +either the problematic old form, or are for highly specialized uses. + +These four functions each return C if the sequence of bytes starting at +C form a complete, legal UTF-8 (or UTF-EBCDIC) sequence for a code point. +If so, C<*cp> will be set to the native code point value it represents, and +C<*advance> will be set to its length, in bytes. + +Otherwise, each function returns C and sets C<*cp> to the Unicode +REPLACEMENT CHARACTER, and C<*advance> to the next position along C, where +the next possible UTF-8 character could begin. + +The functions only examine as many bytes along C as are needed to form a +complete UTF-8 representation of a single code point. Under no circumstances +do they examine any byte beyond S>, failing if the code point +requires more than S> bytes to represent. + +The functions differ only in what flavor of UTF-8 they accept. All reject +syntactically invalid UTF-8. C additionally rejects any +UTF-8 that translates into a code point that isn't specified by Unicode to be +freely exchangeable, namely the surrogate characters and non-character code +points. C instead uses the exchangeable definition given +by Unicode's Corregendum #9, which rejects only surrogates. +C accepts all syntactically valid UTF-8, as extended by +Perl to allow 64-bit code points to be encoded. + +C is merely a synonym of C whose name +explicitly indicates that it accepts Perl-extended UTF-8. Perl programs +traditionally handle this by default. + +Whenever input is rejected, an explanatory warning message is raised, unless +C warnings (or the appropriate subcategory) are turned off. A given +input sequence may contain multiple malformations, giving rise to multiple +warnings, as the functions attempt to find and report on all malformations in a +sequence. All the possible malformations are listed in C>, +with some examples of multiple ones for the same sequence. + +Often, C is an arbitrarily long string containing the UTF-8 representations +of many code points in a row, and these functions are called in the course of +parsing C to find all those code points. + +If your code doesn't know how to deal with illegal input, as would be typical +of a low level routine, the loop could look like: + + while (s < e) { + UV cp; + Size_t advance; + (void) utf8_to_uv(s, e, &cp, &advance); + + s += advance; + } + +A REPLACEMENT CHARACTER will be inserted everywhere that malformed input +occurs. Obviously, we aren't expecting such outcomes, but your code will be +protected from going off the rails. + +If you do have a plan for handling malformed input, you could instead write: + + while (s < e) { + UV cp; + Size_t advance; + + if (UNLIKELY(! utf8_to_uv(s, e, &cp, &advance)) { + + } + + + + s += advance; + } + +You may pass NULL to these functions instead of a pointer to your C +variable. But the only legitimate case to do this is if you are only examining +the first character in C, and have no plans to ever look further. You could +also advance by using C, but this gives the correct result if and +only if the input is well-formed; and is extra work always, as the functions +have already done the equivalent work and return the correct value in +C, regardless of whether the input is well-formed or not. + +You must always pass a non-NULL pointer into which to store the (first) code +point C represents. If you don't care about this value, you should be using +one of the C> functions instead. + +=item Function where the UTF-8 is B to be valid + +C is designed +to be used where you generated the UTF-8 yourself, so you know it is valid. +It skips any error checking, assuming the sequence of bytes starting at C is +encoded as Perl extended UTF-8 (or Perl extended UTF-EBCDIC), reading as many +bytes along C as necessary, and returning that count in C<*retlen> (if +C is not NULL). + +=item C forms + +These are the old form equivalents of C (and its synonym, +C). They are C and C. +There is no old form equivalent of either C nor +C. + +C is DEPRECATED. Do NOT use it; it is a security hole ready to +bring destruction onto you and yours. C is discouraged and +may eventually become deprecated + +C checks if the sequence of bytes starting at C form a +complete, legal UTF-8 (or UTF-EBCDIC) sequence for a code point. If so, it +returns the code point value the sequence represents, and C<*retlen> will be +set to its length, in bytes. Thus, the next possible character in C begins +at S>. + +The function only examines as many bytes along C as are needed to form a +complete UTF-8 representation of a single code point. Under no circumstances +does it examine any byte beyond S>. + +If the sequence examined starting at C is not legal Perl extended UTF-8, the +translation fails, and the resultant behavior unfortunately depends on if the +warnings category "utf8" is enabled or not. -=for apidoc Amnh||UTF8_CHECK_ONLY -=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE -=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE -=for apidoc Amnh||UTF8_DISALLOW_SURROGATE -=for apidoc Amnh||UTF8_DISALLOW_NONCHAR -=for apidoc Amnh||UTF8_DISALLOW_SUPER -=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE -=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE -=for apidoc Amnh||UTF8_WARN_SURROGATE -=for apidoc Amnh||UTF8_WARN_NONCHAR -=for apidoc Amnh||UTF8_WARN_SUPER -=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED -=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED +=over 4 -=cut +=item If C<'utf8'> warnings are disabled -Also implemented as a macro in utf8.h -*/ +The Unicode REPLACEMENT CHARACTER is silently returned, and C<*retlen> is set +(if C isn't C) so that (S + C<*retlen>>) is the next +possible position in C that could begin a non-malformed character. -UV -Perl_utf8n_to_uvchr(const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags) -{ - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; +But note that it is ambiguous whether a REPLACEMENT CHARACTER was actually in +the input, or if this function synthetically generated one. In the unlikely +event that you care, you'd have to examine the input to disambiguate. - return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL); -} +=item If C<'utf8'> warnings are enabled -/* +A warning will be displayed, and 0 is returned and C<*retlen> is set (if +C isn't C) to -1. -=for apidoc utf8n_to_uvchr_error +But note that 0 may also be returned if S<*s> is a legal NUL character. This +means that you have to disambiguate a 0 return. You can do this by checking +that the first byte of C is indeed a NUL; or by making sure to always pass a +non-NULL C pointer, and by examining it. -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Most code should use L() rather than call this -directly. - -This function is for code that needs to know what the precise malformation(s) -are when an error is found. If you also need to know the generated warning -messages, use L() instead. - -It is like C> but it takes an extra parameter placed after -all the others, C. If this parameter is 0, this function behaves -identically to C>. Otherwise, C should be a pointer -to a C variable, which this function sets to indicate any errors found. -Upon return, if C<*errors> is 0, there were no errors found. Otherwise, -C<*errors> is the bit-wise C of the bits described in the list below. Some -of these bits will be set if a malformation is found, even if the input -C parameter indicates that the given malformation is allowed; those -exceptions are noted: +Also note that should you wish to proceed with parsing C, you have no easy +way of knowing where to start looking in it for the next possible character. +It would be better to have instead called an equivalent function that provides +this information; any of the C series, or C>. + +=back + +Because of these quirks, C is very difficult to use +correctly and handle all cases. Generally, you need to bail out at the first +failure it finds. + +The deprecated C behaves the same way as C for +well-formed input, and for the malformations it is capable of finding, but +doesn't find all of them, and it can read beyond the end of the input buffer, +which is why it is deprecated. + +=back + +The bottom line is use the C family of functions. + +=for apidoc utf8_to_uv_flags +=for apidoc_item utf8n_to_uvchr + +These functions are generalized versions of C>, where you need +more control over what UTF-8 sequences are acceptable. These functions are +unlikely to be needed except for specialized purposes. + +C is more like a generalization of C, but +with fewer quirks, and a different method of specifying the bytes it is allowed +to examine in C. It has a C parameter instead of a C parameter, +so the furthest byte in C it can look at is S>. Its return +value is, like C, ambiguous with respect to the NUL and +REPLACEMENT characters, but the value of C<*retlen> can be relied on (except +with the C flag described below) to know where the next +possible character along C starts, removing that quirk. Hence, you always +should use C<*retlen> to determine where the next character in C starts. + +These functions have an additional parameter, C, besides the ones in +C and C, which can be used to broaden or +restrict what is acceptable UTF-8. C has the same meaning and behavior +in both functions. When C is 0, these functions accept any +syntactically valid Perl-extended-UTF-8 sequence. + +There are flags that apply to accepting particular sequences, and flags that +apply to raising warnings about encountering sequences. Each type is +independent of the other. You can reject and not warn; warn and still accept; +or both reject and warn. Rejecting means that the sequence gets translated +into the Unicode REPLACEMENT CHARACTER instead of what it was meant to +represent. + +Even if a flag is passed that indicates warnings are desired; no warning will be +raised if C<'utf8'> warnings (or the appropriate subcategory) are disabled at +the point of the call. =over 4 -=item C +=item C -The input sequence is not standard UTF-8, but a Perl extension. This bit is -set only if the input C parameter contains either the -C or the C flags. +This also suppresses any warnings. And it changes what is stored into +C<*retlen> with the C family of functions (for the worse). It is not +likely to be of use to you. You can use C (described below) to +also turn off warnings, and that flag doesn't adversely affect C<*retlen>. + +=item C -Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard, -and so some extension must be used to express them. Perl uses a natural -extension to UTF-8 to represent the ones up to 2**36-1, and invented a further -extension to represent even higher ones, so that any code point that fits in a -64-bit word can be represented. Text using these extensions is not likely to -be portable to non-Perl code. We lump both of these extensions together and -refer to them as Perl extended UTF-8. There exist other extensions that people -have invented, incompatible with Perl's. +=item C + +These disallow and/or warn about UTF-8 sequences that represent surrogate +characters. + +=item C + +=item C + +These disallow and/or warn about UTF-8 sequences that represent non-character +code points. + +=item C + +=item C + +These disallow and/or warn about UTF-8 sequences that represent code points +above 0x10FFFF. + +=item C + +=item C + +These are the same as having selected all three of the corresponding SURROGATE, +NONCHAR and SUPER flags listed above. + +All such code points are not considered to be safely freely exchangeable +between processes. + +=item C + +=item C + +These are the same as having selected both the corresponding SURROGATE and +SUPER flags listed above. + +Unicode issued L to allow non-character +code points to be exchanged by processes aware of the possibility. (They are +still discouraged, however.) For more discussion see +L. + +=item C + +=item C + +These disallow and/or warn on encountering sequences that require Perl's +extension to UTF-8 to represent them. These are all for code points above +0x10FFFF, so these sequences are a subset of the ones controlled by SUPER or +either of the illegal interchange sets of flags. + +Perl predates Unicode, and earlier standards allowed for code points up through +0x7FFF_FFFF (2**31 - 1). Perl, of course, would like you to be able to +represent in UTF-8 any code point available on the platform. To do so, some +extension must be used to express them. Perl uses a natural extension to UTF-8 +to represent the ones up to 2**36-1, and invented a further extension to +represent even higher ones, so that any code point that fits in a 64-bit word +can be represented. We lump both of these extensions together and refer to +them as Perl extended UTF-8. There exist other extensions that people have +invented, incompatible with Perl's. On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower @@ -1174,29 +1313,79 @@ than on ASCII. Prior to that, code points 2**31 and higher were simply unrepresentable, and a different, incompatible method was used to represent code points between 2**30 and 2**31 - 1. -On both platforms, ASCII and EBCDIC, C is set if -Perl extended UTF-8 is used. +It is likely that programs written in something other than Perl would not be +able to read files that contain these; nor would Perl understand files written +by something that uses a different extension. Hence, you can specify that +above-Unicode code points are generally accepted and/or warned about, but still +exclude the ones that require this extension to represent. + +=item C and kin + +Other flags can be passed to suppress warnings for syntactic malformations +and/or overflowing the number of bits available in a UV on the platform. All +such malformations translate to the REPLACEMENT CHARACTER, regardless of +any of the flags. They, contrary to their names, only control the warnings. -In earlier Perls, this bit was named C, which you still -may use for backward compatibility. That name is misleading, as this flag may -be set when the code point actually does fit in 31 bits. This happens on -EBCDIC platforms, and sometimes when the L> is also present. The new name accurately -describes the situation in all cases. +The only one that you would ever have any reason to use is C +which suppresses the warnings for any of the syntactic malformations and +overflow, except for empty input. + +The other such flags are shown in the C<_GOT_> bits list in +C>. + +All such flags have C<_ALLOW_> in their names, which is misleading. The name +stems from a time when earlier perl versions kind-of, sort-of tried, mostly +unsuccessfully, to accept these malformations in input strings. That ended by +Perl v5.14. + +=back + +=for apidoc utf8_to_uv_msgs +=for apidoc_item utf8n_to_uvchr_msgs +=for apidoc_item utf8_to_uv_errors +=for apidoc_item utf8n_to_uvchr_error + +These functions are generalizations of C> and +C>. They are used for the highly specialized purpose of +when the caller needs to know the exact malformations that were encountered +and/or the diagnostics that would be raised. + +They each take one or two extra parameters, pointers to where to store this +information. The functions with C<_msgs> in their names return both types, so +take two extra parameters; those with C<_error> return just the malformations, +so take just one extra parameter. When the extra parameters are 0, the +functions behave identically to the function they generalize. + +When the C parameter is not NULL, it should be the address of a U32 +variable, into which the functions store a bitmap, described just below, with a +bit set for each malformation the function found; 0 if none. What is +considered a malformation is affected by C, the same as in +C. + +The bits returned in C and their meanings are: + +=over 4 =item C The input sequence was malformed in that the first byte was a UTF-8 -continuation byte. +continuation byte. This bit is not set if the C flag +is set. =item C -The input C parameter was 0. +The input parameters indicated the length of C is 0. Technically, this a +coding error, not a malformation; you should check before calling these +functions if there is actually anything to convert. But perl needs to be able +to recover from bad input, and this is how it does it. + +This bit is not set if the C flag is set. =item C The input sequence was malformed in that there is some other sequence that evaluates to the same code point, but that sequence is shorter than this one. +This bit is not set if the C flag is set. Until Unicode 3.1, it was legal for programs to accept this malformation, but it was discovered that this created security issues. @@ -1212,19 +1401,26 @@ C or the C flags. The input sequence was malformed in that a non-continuation type byte was found in a position where only a continuation type one should be. See also -C>. +C>. This bit is not set if the +C flag is set. =item C The input sequence was malformed in that it is for a code point that is not representable in the number of bits available in an IV on the current platform. +This bit is not set if the C flag is set. + +=item C + +The input sequence is not standard UTF-8, but a Perl extension. This bit is +set only if the input C parameter contains either the +C or the C flags. =item C The input sequence was malformed in that C is smaller than required for a complete sequence. In other words, the input is for a partial character -sequence. - +sequence. This bit is not set if the C flag is set. C and C both indicate a too short sequence. The difference is that C indicates always @@ -1233,12 +1429,12 @@ sequence was looked at. If no other flags are present, it means that the sequence was valid as far as it went. Depending on the application, this could mean one of three things: -=over +=over 4 =item * -The C length parameter passed in was too small, and the function was -prevented from examining all the necessary bytes. +The C or C parameters passed in were too small, and the function +was prevented from examining all the necessary bytes. =item * @@ -1269,105 +1465,109 @@ C or the C flags. =back -To do your own error handling, call this function with the C -flag to suppress any warnings, and then examine the C<*errors> return. +Note that more than one bit may have been set by these functions. This is +because it is possible for multiple malformations to be present in the same +sequence. An example would be an overlong sequence evaluating to a surrogate +when surrogates are forbidden. -=for apidoc Amnh||UTF8_GOT_PERL_EXTENDED -=for apidoc Amnh||UTF8_GOT_CONTINUATION -=for apidoc Amnh||UTF8_GOT_EMPTY -=for apidoc Amnh||UTF8_GOT_LONG -=for apidoc Amnh||UTF8_GOT_NONCHAR -=for apidoc Amnh||UTF8_GOT_NON_CONTINUATION -=for apidoc Amnh||UTF8_GOT_OVERFLOW -=for apidoc Amnh||UTF8_GOT_SHORT -=for apidoc Amnh||UTF8_GOT_SUPER -=for apidoc Amnh||UTF8_GOT_SURROGATE - -=cut - -Also implemented as a macro in utf8.h -*/ +If you don't care about the system's messages text nor warning categories, you +can customize error handling by calling one of the C<_error> functions, using +either of the flags C or C to suppress any +warnings, and then examine the C<*errors> return. -UV -Perl_utf8n_to_uvchr_error(const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors) -{ - PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR; +But if you do care, use one of the functions with C<_msgs> in their names. +These allow you to completely customize error handling. They take a second +extra parameter, C. If the flag C is passed, this +parameter is ignored. Otherwise, if not NULL, it should be a pointer to a +variable which has been declared to be an C, and into which the function +creates a new AV to store information, described below, about all malformations +the function found; NULL is stored if none. What is considered a malformation +is affected by C, the same as described in C>. - return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL); -} - -/* - -=for apidoc utf8n_to_uvchr_msgs - -THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES. -Most code should use L() rather than call this -directly. - -This function is for code that needs to know what the precise malformation(s) -are when an error is found, and wants the corresponding warning and/or error -messages to be returned to the caller rather than be displayed. All messages -that would have been displayed if all lexical warnings are enabled will be -returned. - -It is just like C> but it takes an extra parameter -placed after all the others, C. If this parameter is 0, this function -behaves identically to C>. Otherwise, C should -be a pointer to an C variable, in which this function creates a new AV to -contain any appropriate messages. The elements of the array are ordered so -that the first message that would have been displayed is in the 0th element, -and so on. Each element is a hash with three key-value pairs, as follows: +Each element of the C AV array is an anonymous hash with the following +three key-value pairs: =over 4 =item C -The text of the message as a C. +A C containing the text of any warning message that would have ordinarily +been generated. The function suppresses raising this warning itself. =item C -The warning category (or categories) packed into a C. +The warning category (or categories) for the message, packed into a C. =item C -A single flag bit associated with this message, in a C. -The bit corresponds to some bit in the C<*errors> return value, -such as C. +A C containing a single flag bit associated with this message. The bit +corresponds to some bit in the C<*errors> return value, such as +C. =back -It's important to note that specifying this parameter as non-null will cause -any warnings this function would otherwise generate to be suppressed, and -instead be placed in C<*msgs>. The caller can check the lexical warnings state -(or not) when choosing what to do with the returned messages. +The array is sorted so that element C<[0]> contains the first message that +would have otherwise been raised; C<[1]>, the second; and so on. -If the flag C is passed, no warnings are generated, and hence -no AV is created. +You thus can completely override the normal error handling; you can check the +lexical warnings state (or not) when choosing what to do with the returned +messages. The caller, of course, is responsible for freeing any returned AV. +=for apidoc Amnh||UTF8_ALLOW_CONTINUATION +=for apidoc Amnh||UTF8_ALLOW_EMPTY +=for apidoc Amnh||UTF8_ALLOW_LONG +=for apidoc Amnh||UTF8_ALLOW_NON_CONTINUATION +=for apidoc Amnh||UTF8_ALLOW_OVERFLOW +=for apidoc Amnh||UTF8_ALLOW_PERL_EXTENDED +=for apidoc Amnh||UTF8_ALLOW_SHORT +=for apidoc Amnh||UTF8_CHECK_ONLY +=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE +=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE +=for apidoc Amnh||UTF8_DISALLOW_NONCHAR +=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED +=for apidoc Amnh||UTF8_DISALLOW_SUPER +=for apidoc Amnh||UTF8_DISALLOW_SURROGATE +=for apidoc Amnh||UTF8_GOT_CONTINUATION +=for apidoc Amnh||UTF8_GOT_EMPTY +=for apidoc Amnh||UTF8_GOT_LONG +=for apidoc Amnh||UTF8_GOT_NONCHAR +=for apidoc Amnh||UTF8_GOT_NON_CONTINUATION +=for apidoc Amnh||UTF8_GOT_OVERFLOW +=for apidoc Amnh||UTF8_GOT_PERL_EXTENDED +=for apidoc Amnh||UTF8_GOT_SHORT +=for apidoc Amnh||UTF8_GOT_SUPER +=for apidoc Amnh||UTF8_GOT_SURROGATE +=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE +=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE +=for apidoc Amnh||UTF8_WARN_NONCHAR +=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED +=for apidoc Amnh||UTF8_WARN_SUPER +=for apidoc Amnh||UTF8_WARN_SURROGATE + =cut */ -UV -Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, - STRLEN curlen, - STRLEN *retlen, - const U32 flags, - U32 * errors, - AV ** msgs) +bool +Perl_utf8_to_uv_msgs_helper_(const U8 * const s0, + const U8 * const e, + UV *cp_p, + Size_t *advance_p, + const U32 flags, + U32 * errors, + AV ** msgs) { - const U8 * const s0 = s; - const U8 * send = s0 + curlen; + PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS_HELPER_; + + const U8 * s = s0; + const U8 * send = e; + SSize_t curlen = send - s0; U32 possible_problems; /* A bit is set here for each potential problem found as we go along */ UV uv; - STRLEN expectlen; /* How long should this sequence be? */ - STRLEN avail_len; /* When input is too short, gives what that is */ + SSize_t expectlen; /* How long should this sequence be? */ + SSize_t avail_len; /* When input is too short, gives what that is */ U32 discard_errors; /* Used to save branches when 'errors' is NULL; this gets set and discarded */ @@ -1380,8 +1580,6 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, UV uv_so_far; dTHX; - PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER; - /* Here, is one of: a) malformed; b) a problematic code point (surrogate, * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul * syllables that the dfa doesn't properly handle. Quickly dispose of the @@ -1390,8 +1588,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* Each of the affected Hanguls starts with \xED */ if (is_HANGUL_ED_utf8_safe(s0, send)) { /* Always false on EBCDIC */ - if (retlen) { - *retlen = 3; + if (advance_p) { + *advance_p = 3; } if (errors) { *errors = 0; @@ -1400,9 +1598,10 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, *msgs = NULL; } - return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) - | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) - | (s0[2] & UTF_CONTINUATION_MASK); + *cp_p = ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) + | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) + | (s0[2] & UTF_CONTINUATION_MASK); + return true; } /* In conjunction with the exhaustive tests that can be enabled in @@ -1445,7 +1644,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * We also should not consume too few bytes, otherwise someone could inject * things. For example, an input could be deliberately designed to * overflow, and if this code bailed out immediately upon discovering that, - * returning to the caller C<*retlen> pointing to the very next byte (one + * returning to the caller C<*advance_p> pointing to the very next byte (one * which is actually part of the overflowing sequence), that could look * legitimate to the caller, which could discard the initial partial * sequence and process the rest, inappropriately. @@ -1457,7 +1656,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * allowed one, we could allow in something that shouldn't have been. */ - if (UNLIKELY(curlen == 0)) { + if (UNLIKELY(curlen <= 0)) { possible_problems |= UTF8_GOT_EMPTY; curlen = 0; uv = UNICODE_REPLACEMENT; @@ -1472,8 +1671,8 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * function will be for, has this expected length. For efficiency, set * things up here to return it. It will be overridden only in those rare * cases where a malformation is found */ - if (retlen) { - *retlen = expectlen; + if (advance_p) { + *advance_p = expectlen; } /* A continuation character can't start a valid sequence */ @@ -1551,7 +1750,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, * we must look at the UTF-8 byte sequence itself to see if it is for an * overlong */ if ( ( LIKELY(! possible_problems) - && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv))) + && UNLIKELY(expectlen > (SSize_t) OFFUNISKIP(uv))) || ( UNLIKELY(possible_problems) && ( UNLIKELY(! UTF8_IS_START(*s0)) || (UNLIKELY(0 < is_utf8_overlong(s0, s - s0)))))) @@ -1567,7 +1766,7 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))) { UV min_uv = uv_so_far; - STRLEN i; + SSize_t i; /* Here, the input is both overlong and is missing some trailing * bytes. There is no single code point it could be for, but there @@ -2060,50 +2259,17 @@ Perl__utf8n_to_uvchr_msgs_helper(const U8 *s, /* Since there was a possible problem, the returned length may need to * be changed from the one stored at the beginning of this function. * Instead of trying to figure out if it has changed, just do it. */ - if (retlen) { - *retlen = curlen; + if (advance_p) { + *advance_p = curlen; } if (disallowed) { - if (flags & UTF8_CHECK_ONLY && retlen) { - *retlen = ((STRLEN) -1); - } - return 0; + return false; } } - return UNI_TO_NATIVE(uv); -} - -/* -=for apidoc utf8_to_uvchr_buf - -Returns the native code point of the first character in the string C which -is assumed to be in UTF-8 encoding; C points to 1 beyond the end of C. -C<*retlen> will be set to the length, in bytes, of that character. - -If C does not point to a well-formed UTF-8 character and UTF8 warnings are -enabled, zero is returned and C<*retlen> is set (if C isn't -C) to -1. If those warnings are off, the computed value, if well-defined -(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and -C<*retlen> is set (if C isn't C) so that (S + C<*retlen>>) is -the next possible position in C that could begin a non-malformed character. -See L for details on when the REPLACEMENT CHARACTER is -returned. - -=cut - -Also implemented as a macro in utf8.h - -*/ - - -UV -Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) -{ - PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF; - - return utf8_to_uvchr_buf_helper(s, send, retlen); + *cp_p = UNI_TO_NATIVE(uv); + return true; } /* diff --git a/utf8.h b/utf8.h index 72de6816818b..455891494319 100644 --- a/utf8.h +++ b/utf8.h @@ -143,12 +143,33 @@ typedef enum { Perl_uvchr_to_utf8_flags_msgs(aTHX, d, u, f, 0) #define Perl_uvchr_to_utf8_flags_msgs(mTHX, d, u, f , m) \ Perl_uvoffuni_to_utf8_flags_msgs(aTHX_ d, NATIVE_TO_UNI(u), f, m) -#define utf8_to_uvchr_buf(s, e, lenp) \ - utf8_to_uvchr_buf_helper((const U8 *) (s), (const U8 *) e, lenp) -#define utf8n_to_uvchr(s, len, lenp, flags) \ - utf8n_to_uvchr_error(s, len, lenp, flags, 0) -#define utf8n_to_uvchr_error(s, len, lenp, flags, errors) \ - utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0) + +/* This is needed to cast the parameters for all those calls that had them + * improperly as chars */ +#define utf8_to_uvchr_buf(s, e, lenp) \ + Perl_utf8_to_uvchr_buf(aTHX_ (const U8 *) (s), (const U8 *) e, lenp) + +#define Perl_utf8n_to_uvchr(s, len, lenp, flags) \ + Perl_utf8n_to_uvchr_error(s, len, lenp, flags, 0) +#define Perl_utf8n_to_uvchr_error(s, len, lenp, flags, errors) \ + Perl_utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0) + +#define Perl_utf8_to_uv( s, e, cp_p, advance_p) \ + Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, 0) +#define Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, flags) \ + Perl_utf8_to_uv_errors( s, e, cp_p, advance_p, flags, 0) +#define Perl_utf8_to_uv_errors( s, e, cp_p, advance_p, flags, errors) \ + Perl_utf8_to_uv_msgs( s, e, cp_p, advance_p, flags, errors, 0) +#define Perl_extended_utf8_to_uv(s, e, cp_p, advance_p) \ + Perl_utf8_to_uv(s, e, cp_p, advance_p) +#define Perl_strict_utf8_to_uv( s, e, cp_p, advance_p) \ + Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, \ + ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE \ + | UTF8_WARN_ILLEGAL_INTERCHANGE)) +#define Perl_c9strict_utf8_to_uv(s, e, cp_p, advance_p) \ + Perl_utf8_to_uv_flags( s, e, cp_p, advance_p, \ + ( UTF8_DISALLOW_ILLEGAL_INTERCHANGE \ + | UTF8_WARN_ILLEGAL_INTERCHANGE)) #define utf16_to_utf8(p, d, bytelen, newlen) \ utf16_to_utf8_base(p, d, bytelen, newlen, 0, 1)