Perl · khwilliamson · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/embed.fnc b/embed.fnc
@@ -919,6 +919,11 @@ CTp	|Signal_t|csighandler1	|int sig
 CTp	|Signal_t|csighandler3	|int sig				\
 				|NULLOK Siginfo_t *info 		\
 				|NULLOK void *uap
+ATdmp	|bool	|c9strict_utf8_to_uv					\
+				|NN const U8 * const s			\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p
 EXp	|regexp_engine const *|current_re_engine
 RXp	|XOPRETANY|custom_op_get_field					\
 				|NN const OP *o 			\
@@ -1174,6 +1179,11 @@ AOdp	|SV *	|eval_pv	|NN const char *p			\
 				|I32 croak_on_error
 AOdp	|SSize_t|eval_sv	|NN SV *sv				\
 				|I32 flags
+ATdmp	|bool	|extended_utf8_to_uv					\
+				|NN const U8 * const s			\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p
 Adfpv	|void	|fatal_warner	|U32 err				\
 				|NN const char *pat			\
 				|...
@@ -3070,6 +3080,11 @@ dopx	|PerlIO *|start_glob	|NN SV *tmpglob 			\
 				|NN IO *io
 Adp	|I32	|start_subparse |I32 is_format				\
 				|U32 flags
+ATdmp	|bool	|strict_utf8_to_uv					\
+				|NN const U8 * const s			\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p
 CRp	|NV	|str_to_version |NN SV *sv
 : Used in pp_ctl.c
 p	|void	|sub_crush_depth|NN CV *cv
@@ -3671,12 +3686,11 @@ ARTdmp	|U8 *	|utf8_hop_safe	|NN const U8 *s 			\
 				|NN const U8 * const end
 ARdp	|STRLEN |utf8_length	|NN const U8 *s0			\
 				|NN const U8 *e
-
-AMTdp	|UV	|utf8n_to_uvchr |NN const U8 *s 			\
+ATdmp	|UV	|utf8n_to_uvchr |NN const U8 *s 			\
 				|STRLEN curlen				\
 				|NULLOK STRLEN *retlen			\
 				|const U32 flags
-AMTdp	|UV	|utf8n_to_uvchr_error					\
+ATdmp	|UV	|utf8n_to_uvchr_error					\
 				|NN const U8 *s 			\
 				|STRLEN curlen				\
 				|NULLOK STRLEN *retlen			\
@@ -3689,13 +3703,6 @@ ATdip	|UV	|utf8n_to_uvchr_msgs					\
 				|const U32 flags			\
 				|NULLOK U32 *errors			\
 				|NULLOK AV **msgs
-CTp	|UV	|_utf8n_to_uvchr_msgs_helper				\
-				|NN const U8 *s 			\
-				|STRLEN curlen				\
-				|NULLOK STRLEN *retlen			\
-				|const U32 flags			\
-				|NULLOK U32 *errors			\
-				|NULLOK AV **msgs
 CDbdp	|UV	|utf8n_to_uvuni |NN const U8 *s 			\
 				|STRLEN curlen				\
 				|NULLOK STRLEN *retlen			\
@@ -3725,16 +3732,44 @@ EMXp	|U8 *	|utf16_to_utf8_reversed 				\
 				|NN U8 *d				\
 				|Size_t bytelen 			\
 				|NN Size_t *newlen
+ATdmp	|bool	|utf8_to_uv	|NN const U8 * const s			\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p
 ADbdp	|UV	|utf8_to_uvchr	|NN const U8 *s 			\
 				|NULLOK STRLEN *retlen
-AMdp	|UV	|utf8_to_uvchr_buf					\
-				|NN const U8 *s 			\
-				|NN const U8 *send			\
-				|NULLOK STRLEN *retlen
-Cip	|UV	|utf8_to_uvchr_buf_helper				\
+AMdip	|UV	|utf8_to_uvchr_buf					\
 				|NN const U8 *s 			\
 				|NN const U8 *send			\
 				|NULLOK STRLEN *retlen
+ATdmp	|bool	|utf8_to_uv_errors					\
+				|NN const U8 * const s			\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p		\
+				|const U32 flags			\
+				|NULLOK U32 *errors
+ATdmp	|bool	|utf8_to_uv_flags					\
+				|NN const U8 * const s			\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p		\
+				|const U32 flag
+ATdip	|bool	|utf8_to_uv_msgs|NN const U8 * const s0 		\
+				|NN const U8 *e 			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p		\
+				|const U32 flags			\
+				|NULLOK U32 *errors			\
+				|NULLOK AV **msgs
+CTp	|bool	|utf8_to_uv_msgs_helper_				\
+				|NN const U8 * const s0 		\
+				|NN const U8 * const e			\
+				|NN UV *cp_p				\
+				|NULLOK Size_t *advance_p		\
+				|const U32 flags			\
+				|NULLOK U32 *errors			\
+				|NULLOK AV **msgs
 CDbdp	|UV	|utf8_to_uvuni	|NN const U8 *s 			\
 				|NULLOK STRLEN *retlen
 : Used in perly.y

diff --git a/embed.h b/embed.h
@@ -125,7 +125,6 @@
 # define _to_utf8_lower_flags(a,b,c,d,e)        Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e)
 # define _to_utf8_title_flags(a,b,c,d,e)        Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e)
 # define _to_utf8_upper_flags(a,b,c,d,e)        Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e)
-# define _utf8n_to_uvchr_msgs_helper            Perl__utf8n_to_uvchr_msgs_helper
 # define amagic_call(a,b,c,d)                   Perl_amagic_call(aTHX_ a,b,c,d)
 # define amagic_deref_call(a,b)                 Perl_amagic_deref_call(aTHX_ a,b)
 # define apply_attrs_string(a,b,c,d)            Perl_apply_attrs_string(aTHX_ a,b,c,d)
@@ -158,6 +157,7 @@
 # define bytes_cmp_utf8(a,b,c,d)                Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
 # define bytes_from_utf8_loc                    Perl_bytes_from_utf8_loc
 # define bytes_to_utf8(a,b)                     Perl_bytes_to_utf8(aTHX_ a,b)
+# define c9strict_utf8_to_uv                    Perl_c9strict_utf8_to_uv
 # define call_argv(a,b,c)                       Perl_call_argv(aTHX_ a,b,c)
 # define call_atexit(a,b)                       Perl_call_atexit(aTHX_ a,b)
 # define call_list(a,b)                         Perl_call_list(aTHX_ a,b)
@@ -222,6 +222,7 @@
 # define dump_vindent(a,b,c,d)                  Perl_dump_vindent(aTHX_ a,b,c,d)
 # define eval_pv(a,b)                           Perl_eval_pv(aTHX_ a,b)
 # define eval_sv(a,b)                           Perl_eval_sv(aTHX_ a,b)
+# define extended_utf8_to_uv                    Perl_extended_utf8_to_uv
 # define fbm_compile(a,b)                       Perl_fbm_compile(aTHX_ a,b)
 # define fbm_instr(a,b,c,d)                     Perl_fbm_instr(aTHX_ a,b,c,d)
 # define filter_add(a,b)                        Perl_filter_add(aTHX_ a,b)
@@ -645,6 +646,7 @@
 # define stack_grow(a,b,c)                      Perl_stack_grow(aTHX_ a,b,c)
 # define start_subparse(a,b)                    Perl_start_subparse(aTHX_ a,b)
 # define str_to_version(a)                      Perl_str_to_version(aTHX_ a)
+# define strict_utf8_to_uv                      Perl_strict_utf8_to_uv
 # define suspend_compcv(a)                      Perl_suspend_compcv(aTHX_ a)
 # define sv_2bool_flags(a,b)                    Perl_sv_2bool_flags(aTHX_ a,b)
 # define sv_2cv(a,b,c,d)                        Perl_sv_2cv(aTHX_ a,b,c,d)
@@ -797,7 +799,13 @@
 # define utf8_hop_safe                          Perl_utf8_hop_safe
 # define utf8_length(a,b)                       Perl_utf8_length(aTHX_ a,b)
 # define utf8_to_bytes(a,b)                     Perl_utf8_to_bytes(aTHX_ a,b)
-# define utf8_to_uvchr_buf_helper(a,b,c)        Perl_utf8_to_uvchr_buf_helper(aTHX_ a,b,c)
+# define utf8_to_uv                             Perl_utf8_to_uv
+# define utf8_to_uv_errors                      Perl_utf8_to_uv_errors
+# define utf8_to_uv_flags                       Perl_utf8_to_uv_flags
+# define utf8_to_uv_msgs                        Perl_utf8_to_uv_msgs
+# define utf8_to_uv_msgs_helper_                Perl_utf8_to_uv_msgs_helper_
+# define utf8n_to_uvchr                         Perl_utf8n_to_uvchr
+# define utf8n_to_uvchr_error                   Perl_utf8n_to_uvchr_error
 # define utf8n_to_uvchr_msgs                    Perl_utf8n_to_uvchr_msgs
 # define uvchr_to_utf8(a,b)                     Perl_uvchr_to_utf8(aTHX,a,b)
 # define uvchr_to_utf8_flags(a,b,c)             Perl_uvchr_to_utf8_flags(aTHX,a,b,c)

diff --git a/inline.h b/inline.h
@@ -1231,17 +1231,6 @@ Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
     }
 }
 
-/*
-=for apidoc valid_utf8_to_uvchr
-Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
-known that the next character in the input UTF-8 string C<s> is well-formed
-(I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
-points, and non-Unicode code points are allowed.
-
-=cut
-
- */
-
 PERL_STATIC_INLINE UV
 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
 {
@@ -2053,7 +2042,7 @@ C<L</is_strict_utf8_string>> (and kin); and if C<flags> is
 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, they give the same results as
 C<L</is_c9strict_utf8_string>> (and kin).  Otherwise C<flags> may be any
 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
-C<L</utf8n_to_uvchr>>, with the same meanings.
+C<L</utf8_to_uv>>, with the same meanings.
 
 It's better to use one of the non-C<_flags> functions if they give you the
 desired strictness, as those have a better chance of being inlined by the C
@@ -2307,7 +2296,7 @@ as C<L</isSTRICT_UTF8_CHAR>>;
 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
-understood by C<L</utf8n_to_uvchr>>, with the same meanings.
+understood by C<L</utf8_to_uv>>, with the same meanings.
 
 The three alternative macros are for the most commonly needed validations; they
 are likely to run somewhat faster than this more general one, as they can be
@@ -2932,7 +2921,7 @@ C<is_utf8_valid_partial_char_flags> when the latter is called with a zero
 C<flags> parameter.  This parameter is used to restrict the classes of code
 points that are considered to be valid.  When zero, Perl's extended UTF-8 is
 used.  Otherwise C<flags> can be any combination of the C<UTF8_DISALLOW_I<foo>>
-flags accepted by C<L</utf8n_to_uvchr>>.  If there is any sequence of bytes
+flags accepted by C<L</utf8_to_uv>>.  If there is any sequence of bytes
 that can complete the input partial character in such a way that a
 non-prohibited character is formed, the function returns TRUE; otherwise FALSE.
 Non-character code points cannot be determined based on partial character
@@ -3004,7 +2993,7 @@ complete code point, this will return TRUE anyway, provided that
 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
 
 C<flags> can be zero or any combination of the C<UTF8_DISALLOW_I<foo>> flags
-accepted by C<L</utf8n_to_uvchr>>, and with the same meanings.
+accepted by C<L</utf8_to_uv>>, and with the same meanings.
 
 The functions differ from C<L</is_utf8_string_flags>> only in that the latter
 returns FALSE if the final few bytes of the string don't form a complete code
@@ -3049,21 +3038,22 @@ Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
 }
 
-PERL_STATIC_INLINE UV
-Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
-                         STRLEN curlen,
-                         STRLEN *retlen,
-                         const U32 flags,
-                         U32 * errors,
-                         AV ** msgs)
+PERL_STATIC_INLINE bool
+Perl_utf8_to_uv_msgs(const U8 * const s0,
+                     const U8 * const e,
+                     UV * cp_p,
+                     Size_t *advance_p,
+                     const U32 flags,
+                     U32 * errors,
+                     AV ** msgs)
 {
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+    PERL_ARGS_ASSERT_UTF8_TO_UV_MSGS;
 
-    /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
-     * simple cases, and, if necessary calls a helper function to deal with the
-     * more complex ones.  Almost all well-formed non-problematic code points
-     * are considered simple, so that it's unlikely that the helper function
-     * will need to be called. */
+    /* This is the inlined portion of utf8_to_uv_msgs.  It handles the simple
+     * cases, and, if necessary calls a helper function to deal with the more
+     * complex ones.  Almost all well-formed non-problematic code points are
+     * considered simple, so that it's unlikely that the helper function will
+     * need to be called. */
 
     /* Assume that isn't malformed; the vast majority of calls won't be */
     if (errors) {
@@ -3076,25 +3066,25 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
 
     /* No calls from core pass in an empty string; non-core need a check */
 #ifdef PERL_CORE
-    assert(curlen > 0);
+    assert(e > s0);
 #else
-    if (LIKELY(curlen > 0))
+    if (LIKELY(e > s0))
 #endif
 
     {
         /* UTF-8 invariants are returned unchanged.  The code below is quite
          * capable of handling this, but this shortcuts this very common case
          * */
         if (UTF8_IS_INVARIANT(*s0)) {
-            if (retlen) {
-                *retlen = 1;
+            if (advance_p) {
+                *advance_p = 1;
             }
 
-            return *s0;
+            *cp_p = *s0;
+            return true;
         }
 
         const U8 * s = s0;
-        const U8 * send = s + curlen;
 
         /* This dfa is fast.  If it accepts the input, it was for a
          * well-formed, non-problematic code point, which can be returned
@@ -3115,50 +3105,83 @@ Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
         PERL_UINT_FAST8_T state = PL_strict_utf8_dfa_tab[256 + type];
         UV uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
 
-        while (state > 1 && ++s < send) {
+        while (state > 1 && ++s < e) {
             type  = PL_strict_utf8_dfa_tab[*s];
             state = PL_strict_utf8_dfa_tab[256 + state + type];
 
             uv = UTF8_ACCUMULATE(uv, *s);
         }
 
         if (LIKELY(state == 0)) {
-            if (retlen) {
-                *retlen = s - s0 + 1;
+            if (advance_p) {
+                *advance_p = s - s0 + 1;
             }
 
-            return UNI_TO_NATIVE(uv);
+            *cp_p = UNI_TO_NATIVE(uv);
+            return true;
         }
     }
 
     /* Here is potentially problematic.  Use the full mechanism */
-    return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
-                                       errors, msgs);
+    return utf8_to_uv_msgs_helper_(s0, e, cp_p, advance_p, flags, errors, msgs);
 }
 
 PERL_STATIC_INLINE UV
-Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+Perl_utf8n_to_uvchr_msgs(const U8 * const s0,
+                         STRLEN curlen,
+                         STRLEN *retlen,
+                         const U32 flags,
+                         U32 * errors,
+                         AV ** msgs)
 {
-    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
 
-    assert(s < send);
+    UV cp;
+    if (LIKELY(utf8_to_uv_msgs(s0, s0 + curlen, &cp, retlen, flags, errors,
+                                                                        msgs)))
+    {
+        return cp;
+    }
+
+    if (flags & UTF8_CHECK_ONLY && retlen) {
+        *retlen = ((STRLEN) -1);
+    }
+
+    return 0;
+}
 
-    if (! ckWARN_d(WARN_UTF8)) {
 
-        /* EMPTY is not really allowed, and asserts on debugging builds.  But
-         * on non-debugging we have to deal with it, and this causes it to
-         * return the REPLACEMENT CHARACTER, as the documentation indicates */
-        return utf8n_to_uvchr(s, send - s, retlen,
-                              (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
+PERL_STATIC_INLINE UV
+Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
+    assert(s < send);
+
+    UV cp;
+
+    /* When everything is legal, just return that; but when not:
+     *  1) if warnings are enabled return 0 and retlen to -1
+     *  2) if warnings are disabled, set 'flags' to accept any malformation,
+     *     but that will just cause the REPLACEMENT CHARACTER to be returned,
+     *     as the documentation indicates.  EMPTY is not really allowed, and
+     *     asserts on debugging builds.  But on non-debugging we have to deal
+     *     with it.
+     * This API means 0 can mean a legal NUL, or the input is malformed; and
+     * the caller has to know if warnings are disabled to know if it can rely on
+     * 'retlen'.  Best to use utf8_to_uv() instead */
+    U32 flags = (ckWARN_d(WARN_UTF8)) ? 0 : (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY);
+
+    if (   LIKELY(utf8_to_uv_flags(s, send, &cp, retlen, flags))
+        || flags)
+    {
+        return cp;
     }
-    else {
-        UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
-        if (retlen && ret == 0 && (send <= s || *s != '\0')) {
-            *retlen = (STRLEN) -1;
-        }
 
-        return ret;
+    if (retlen) {
+        *retlen = (STRLEN) -1;
     }
+
+    return 0;
 }
 
 /* ------------------------------- perl.h ----------------------------- */